freedreno/ir3: sched should mark outputs used
[mesa.git] / src / freedreno / ir3 / ir3_sched.c
/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A recursive depth-based scheduling algorithm. Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions). Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would no longer be valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation. Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
51
52 struct ir3_sched_ctx {
53 struct ir3_block *block; /* the current block */
54 struct list_head depth_list; /* depth sorted unscheduled instrs */
55 struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
56 struct ir3_instruction *addr; /* current a0.x user, if any */
57 struct ir3_instruction *pred; /* current p0.x user, if any */
58 int live_values; /* estimate of current live values */
59 bool error;
60 };
61
62 static bool is_sfu_or_mem(struct ir3_instruction *instr)
63 {
64 return is_sfu(instr) || is_mem(instr);
65 }
66
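/* Live-value bookkeeping: each instruction's use_count tracks how many of
 * its users have not yet been scheduled.  The fanin/fanout meta instructions
 * (OPC_META_FI/OPC_META_FO) do not occupy registers themselves, so we look
 * through them and account for their sources directly.  Once the last user
 * of a value is scheduled, its dest registers stop being counted as live.
 */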
static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
			unuse_each_src(ctx, src);
		} else {
			debug_assert(src->use_count > 0);

			if (--src->use_count == 0) {
				ctx->live_values -= dest_regs(src);
				debug_assert(ctx->live_values >= 0);
			}
		}
	}
}

static void use_instr(struct ir3_instruction *instr);

static void
use_each_src(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		use_instr(src);
	}
}

static void
use_instr(struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO)) {
		use_each_src(instr);
	} else {
		instr->use_count++;
	}
}

static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
		return;

	ctx->live_values += dest_regs(instr);
	unuse_each_src(ctx, instr);
}

static void
update_use_count(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			instr->use_count = 0;
		}
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
				continue;

			use_each_src(instr);
		}
	}

	/* Shader outputs are also used:
	 */
	for (unsigned i = 0; i < ir->noutputs; i++) {
		struct ir3_instruction *out = ir->outputs[i];

		if (!out)
			continue;

		use_instr(out);
	}
}

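/* The scheduler caches the result of find_instr_recursive() in instr->data.
 * NULL_INSTR is a sentinel meaning "we already looked and found nothing
 * schedulable below this instruction", as opposed to NULL which means
 * "no cached result yet".
 */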
#define NULL_INSTR ((void *)~0)

static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	update_live_values(ctx, instr);

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}

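/* Pick (and remove from the array) the remaining candidate src with the
 * largest ->sun value, so that repeated calls walk the sources from the
 * "deepest"/highest-weight one downwards.
 */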
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->sun > d->sun))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

/**
 * @block: the block to search in, starting from end; in first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions).  For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned maxd, bool pred)
{
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
		if ((n == instr) || (d >= maxd))
			return d;
		/* NOTE: don't count branch/jump since we don't know yet if they will
		 * be eliminated later in resolve_jumps().. really should do that
		 * earlier so we don't have this constraint.
		 */
		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
			d++;
	}

	/* if coming from a predecessor block, assume it is assigned far
	 * enough away.. we'll fix up later.
	 */
	if (!pred)
		return maxd;

	if (pred && (block->data != block)) {
		/* Search into predecessor blocks, finding the one with the
		 * shortest distance, since that will be the worst case
		 */
		unsigned min = maxd - d;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		for (unsigned i = 0; i < block->predecessors_count; i++) {
			unsigned n;

			n = distance(block->predecessors[i], instr, min, pred);

			min = MIN2(min, n);
		}

		block->data = NULL;
		d += min;
	}

	return d;
}

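/* Note on "soft": the candidate search in sched_block()/find_eligible_instr()
 * first calls delay_calc() with soft=true, and only the nop padding that is
 * actually emitted uses soft=false.  With soft=true an sfu producer is charged
 * a fixed estimate (4) so the scheduler prefers to cover that latency with
 * other useful work, while soft=false reflects only the delay slots that
 * genuinely have to be filled.
 */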
/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer,
		unsigned srcn, bool soft, bool pred)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
			delay = MAX2(delay, d);
		}
	} else {
		if (soft) {
			if (is_sfu(assigner)) {
				delay = 4;
			} else {
				delay = ir3_delayslots(assigner, consumer, srcn);
			}
		} else {
			delay = ir3_delayslots(assigner, consumer, srcn);
		}
		delay -= distance(block, assigner, delay, pred);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		d = delay_calc_srcn(block, src, instr, i, soft, pred);
		delay = MAX2(delay, d);
	}

	return delay;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the best instruction to schedule from specified instruction or
 * recursively its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's.  Otherwise the recursive algo can scale quite
	 * badly w/ shader size.  But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src) && (src->block == instr->block)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}

/* find instruction to schedule: */
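/* Walks the depth list from the deepest entry, recursing into each entry's
 * unscheduled sources, and normally picks the candidate with the smallest
 * delay.  Once the live-value estimate exceeds 16*4 (roughly 16 full
 * registers worth of components), candidates are instead compared only by
 * ->sun, to prioritize reducing register pressure.
 */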
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool soft)
{
	struct ir3_instruction *best_instr = NULL;
	unsigned min_delay = ~0;

	/* TODO we'd really rather use the list/array of block outputs.  But we
	 * don't have such a thing.  Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;
		unsigned delay;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		if (ctx->live_values > 16*4) {
			/* under register pressure, only care about reducing live values: */
			if (!best_instr || (candidate->sun > best_instr->sun))
				best_instr = candidate;
		} else {
			delay = delay_calc(ctx->block, candidate, soft, false);
			if ((delay < min_delay) ||
					((delay <= (min_delay + 2)) && (candidate->sun > best_instr->sun))) {
				best_instr = candidate;
				min_delay = delay;
			}
		}
	}

	return best_instr;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = ir3_instr_clone(ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = ir3_instr_clone(ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if (instr->opc == OPC_META_INPUT) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}

	while (!list_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes, true);
		if (!instr)
			instr = find_eligible_instr(ctx, &notes, false);

		if (instr) {
			unsigned delay = delay_calc(ctx->block, instr, false, false);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block.  Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx->block, ctx->pred, delay, false);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch.  Throw in an assert so we notice if this
		 * ever breaks on future generations:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}

/* After scheduling individual blocks, we still could have cases where,
 * on one (or more) paths into a block, a value produced by a previous
 * block has too few delay slots to be legal.  We can't deal with this in
 * the first pass, because of loops (ie. we can't ensure all predecessor
 * blocks are already scheduled in the first pass).  All we can really do
 * at this point is stuff in extra nop's until things are legal.
 */
static void
sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	unsigned n = 0;

	ctx->block = block;

	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
		unsigned delay = 0;

		for (unsigned i = 0; i < block->predecessors_count; i++) {
			unsigned d = delay_calc(block->predecessors[i], instr, false, true);
			delay = MAX2(d, delay);
		}

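		/* the n instructions (including any nops) already emitted ahead
		 * of this one in the current block also help cover the delay
		 * from the predecessor, hence comparing against n rather than
		 * zero:
		 */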
		while (delay > n) {
			struct ir3_instruction *nop = ir3_NOP(block);

			/* move to before instr: */
			list_delinit(&nop->node);
			list_addtail(&nop->node, &instr->node);

			n++;
		}

		/* we can bail once we hit worst case delay: */
		if (++n > 6)
			break;
	}
}

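/* Main entry point: schedule each block in isolation, then make a second
 * pass (sched_intra_block()) to pad any remaining cross-block delays with
 * nops.
 */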
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};

	ir3_clear_mark(ir);
	update_use_count(ir);

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		ctx.live_values = 0;
		sched_block(&ctx, block);
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_intra_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}

static unsigned
get_array_id(struct ir3_instruction *instr)
{
	/* The expectation is that there is only a single array
	 * src or dst, ir3_cp should enforce this.
	 */

	for (unsigned i = 0; i < instr->regs_count; i++)
		if (instr->regs[i]->flags & IR3_REG_ARRAY)
			return instr->regs[i]->array.id;

	unreachable("this was unexpected");
}

/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;

	if (instr->barrier_class & prior->barrier_conflict) {
		if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
			/* if only array barrier, then we can further limit false-deps
			 * by considering the array-id, ie reads/writes to different
			 * arrays do not depend on each other (no aliasing)
			 */
			if (get_array_id(instr) != get_array_id(prior)) {
				return false;
			}
		}

		return true;
	}

	return false;
}

static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (is_meta(pi))
			continue;

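		/* the assumption here seems to be that an earlier instruction
		 * of the same barrier class already carries any needed ordering
		 * against instructions further back, so a single dep on the
		 * closest such instruction is enough and we can stop the walk:
		 */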
		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			break;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (is_meta(ni))
			continue;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			break;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}

/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}

void
ir3_sched_add_deps(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		calculate_deps(block);
	}
}