/* mesa.git: src/freedreno/ir3/ir3_sched.c */
/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A recursive depth-based scheduling algo. Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions). Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would no longer be valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation. Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
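/* For example (roughly), with two a0.x writers A1/A2 and their users U1/U2:
 *
 *    A1 (writes a0.x)      A2 (writes a0.x)
 *    U1 (reads a0.x)       U2 (reads a0.x)
 *
 * A1, U1, A2, U2 is a legal order, but A1, A2, U1, U2 is not, since the
 * second write would clobber the value that U1 still needs.
 */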

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	int live_values;                   /* estimate of current live values */
	bool error;
};

static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

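/* Live-value bookkeeping: use_count tracks how many unscheduled users an
 * instruction's result still has.  Scheduling an instruction makes its own
 * dest regs live and, when it consumes the last use of a source, lets that
 * source's regs die.  Meta fanin/fanout (collect/split) instructions are
 * transparent here, so we recurse through them to the real sources.
 */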
static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
			unuse_each_src(ctx, src);
		} else {
			debug_assert(src->use_count > 0);

			if (--src->use_count == 0) {
				ctx->live_values -= dest_regs(src);
				debug_assert(ctx->live_values >= 0);
			}
		}
	}
}

static void use_instr(struct ir3_instruction *instr);

static void
use_each_src(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		use_instr(src);
	}
}

static void
use_instr(struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO)) {
		use_each_src(instr);
	} else {
		instr->use_count++;
	}
}

static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
		return;

	ctx->live_values += dest_regs(instr);
	unuse_each_src(ctx, instr);
}

static void
update_use_count(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			instr->use_count = 0;
		}
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
				continue;

			use_each_src(instr);
		}
	}

	/* Shader outputs are also used:
	 */
	for (unsigned i = 0; i < ir->noutputs; i++) {
		struct ir3_instruction *out = ir->outputs[i];

		if (!out)
			continue;

		use_instr(out);
	}
}

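/* instr->data caches the result of find_instr_recursive(): NULL means
 * "not computed yet", NULL_INSTR means "no schedulable candidate found",
 * anything else is the cached candidate.  clear_cache() invalidates the
 * entries that scheduling a given instruction may have made stale (or all
 * cached entries when called with instr == NULL).
 */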
#define NULL_INSTR ((void *)~0)

static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	update_live_values(ctx, instr);

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}

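/* Pick the candidate with the greatest ->depth from srcs[] and clear its
 * slot, so that repeated calls return the remaining sources in decreasing
 * depth order.
 */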
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

/**
 * @block: the block to search in, starting from end; in first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions).  For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned maxd, bool pred)
{
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
		if ((n == instr) || (d >= maxd))
			return d;
		/* NOTE: don't count branch/jump since we don't know yet if they will
		 * be eliminated later in resolve_jumps().. really should do that
		 * earlier so we don't have this constraint.
		 */
		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
			d++;
	}

	/* if coming from a predecessor block, assume it is assigned far
	 * enough away.. we'll fix up later.
	 */
	if (!pred)
		return maxd;

	if (pred && (block->data != block)) {
		/* Search into predecessor blocks, finding the one with the
		 * shortest distance, since that will be the worst case
		 */
		unsigned min = maxd - d;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		for (unsigned i = 0; i < block->predecessors_count; i++) {
			unsigned n;

			n = distance(block->predecessors[i], instr, min, pred);

			min = MIN2(min, n);
		}

		block->data = NULL;
		d += min;
	}

	return d;
}

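/* Delay calculation: the "soft" variant is used when picking what to
 * schedule next: it charges a nominal delay of 4 slots for sfu results so
 * the scheduler tries to cover their latency with other work.  The
 * non-soft value from ir3_delayslots() is the hard minimum, which
 * sched_block() enforces with nop's if nothing else is ready.
 */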
/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer,
		unsigned srcn, bool soft, bool pred)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
			delay = MAX2(delay, d);
		}
	} else {
		if (soft) {
			if (is_sfu(assigner)) {
				delay = 4;
			} else {
				delay = ir3_delayslots(assigner, consumer, srcn);
			}
		} else {
			delay = ir3_delayslots(assigner, consumer, srcn);
		}
		delay -= distance(block, assigner, delay, pred);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		d = delay_calc_srcn(block, src, instr, i, soft, pred);
		delay = MAX2(delay, d);
	}

	return delay;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any conditional block
	 * which contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the best instruction to schedule from specified instruction or
 * recursively its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's.  Otherwise the recursive algo can scale quite
	 * badly w/ shader size.  But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src) && (src->block == instr->block)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}

/* find net change to live values if instruction were scheduled: */
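/* (For example: if an instruction's only source has use_count == 1 and
 * occupies the same number of regs as the new dest, the net effect is
 * zero: one value becomes live while another dies.)
 */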
static int
live_effect(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	int new_live = dest_regs(instr);
	int old_live = 0;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;

		if (instr->block != src->block)
			continue;

		/* for fanout/split, just pass things along to the real src: */
		if (src->opc == OPC_META_FO)
			src = ssa(src->regs[1]);

		/* for fanin/collect, if this is the last use of *each* src,
		 * then it will decrease the live values, since RA treats
		 * them as a whole:
		 */
		if (src->opc == OPC_META_FI) {
			struct ir3_instruction *src2;
			bool last_use = true;

			foreach_ssa_src(src2, src) {
				if (src2->use_count > 1) {
					last_use = false;
					break;
				}
			}

			if (last_use)
				old_live += dest_regs(src);

		} else {
			debug_assert(src->use_count > 0);

			if (src->use_count == 1) {
				old_live += dest_regs(src);
			}
		}
	}

	return new_live - old_live;
}

/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool soft)
{
	struct ir3_instruction *best_instr = NULL;
	int best_rank = INT_MAX; /* lower is better */
	unsigned deepest = 0;

	/* TODO we'd really rather use the list/array of block outputs.  But we
	 * don't have such a thing.  Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		if (is_meta(candidate))
			return candidate;

		deepest = MAX2(deepest, candidate->depth);
	}

	/* traverse the list a second time.. but since we cache the result of
	 * find_instr_recursive() it isn't as bad as it looks.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		/* determine net change to # of live values: */
		int le = live_effect(candidate);

		/* if there is a net increase in # of live values, then apply some
		 * threshold to avoid instructions getting scheduled *too* early
		 * and increasing register pressure.
		 */
		if (le >= 1) {
			unsigned threshold;

			if (ctx->live_values > 4*4) {
				threshold = 4;
			} else {
				threshold = 6;
			}

			/* Filter out any "shallow" instructions which would otherwise
			 * tend to get scheduled too early to fill delay slots even
			 * when they are not needed for a while.  There will probably
			 * be later delay slots that they could just as easily fill.
			 *
			 * A classic case where this comes up is frag shaders that
			 * write a constant value (like 1.0f) to one of the channels
			 * of the output color(s).  Since the mov from immed has no
			 * dependencies, it would otherwise get scheduled early to
			 * fill delay slots, occupying a register until the end of
			 * the program.
			 */
			if ((deepest - candidate->depth) > threshold)
				continue;
		}

		int rank = delay_calc(ctx->block, candidate, soft, false);

		/* if too many live values, prioritize instructions that reduce the
		 * number of live values:
		 */
		if (ctx->live_values > 16*4) {
			rank = le;
		} else if (ctx->live_values > 4*4) {
			rank += le;
		}

		if (rank < best_rank) {
			best_instr = candidate;
			best_rank = rank;
		}
	}

	return best_instr;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = ir3_instr_clone(ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = ir3_instr_clone(ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if (instr->opc == OPC_META_INPUT) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}

	while (!list_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes, true);
		if (!instr)
			instr = find_eligible_instr(ctx, &notes, false);

		if (instr) {
			unsigned delay = delay_calc(ctx->block, instr, false, false);

			/* and if we run out of instructions that can be
			 * scheduled, then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block.  Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx->block, ctx->pred, delay, false);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch.  Throw in an assert so we notice if this
		 * ever breaks on a future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}

/* After scheduling individual blocks, we still could have cases where,
 * on one (or more) paths into a block, a value produced by a previous
 * block has too few delay slots to be legal.  We can't deal with this
 * in the first pass, because of loops (ie. we can't ensure all
 * predecessor blocks are already scheduled in the first pass).  All we
 * can really do at this point is stuff in extra nop's until things are
 * legal.
 */
static void
sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	unsigned n = 0;

	ctx->block = block;

	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
		unsigned delay = 0;

		for (unsigned i = 0; i < block->predecessors_count; i++) {
			unsigned d = delay_calc(block->predecessors[i], instr, false, true);
			delay = MAX2(d, delay);
		}

		while (delay > n) {
			struct ir3_instruction *nop = ir3_NOP(block);

			/* move to before instr: */
			list_delinit(&nop->node);
			list_addtail(&nop->node, &instr->node);

			n++;
		}

		/* we can bail once we hit worst case delay: */
		if (++n > 6)
			break;
	}
}

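/* Entry point: schedule each block independently, then make a second
 * pass (sched_intra_block()) to insert any nop's needed for values that
 * are produced in a predecessor block.
 */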
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};

	ir3_clear_mark(ir);
	update_use_count(ir);

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		ctx.live_values = 0;
		sched_block(&ctx, block);
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_intra_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}

static unsigned
get_array_id(struct ir3_instruction *instr)
{
	/* The expectation is that there is only a single array
	 * src or dst, ir3_cp should enforce this.
	 */

	for (unsigned i = 0; i < instr->regs_count; i++)
		if (instr->regs[i]->flags & IR3_REG_ARRAY)
			return instr->regs[i]->array.id;

	unreachable("this was unexpected");
}

/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;

	if (instr->barrier_class & prior->barrier_conflict) {
		if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
			/* if only array barrier, then we can further limit false-deps
			 * by considering the array-id, ie reads/writes to different
			 * arrays do not depend on each other (no aliasing)
			 */
			if (get_array_id(instr) != get_array_id(prior)) {
				return false;
			}
		}

		return true;
	}

	return false;
}

static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (is_meta(pi))
			continue;

		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			break;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (is_meta(ni))
			continue;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			break;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}

/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}

void
ir3_sched_add_deps(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		calculate_deps(block);
	}
}