/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"
/*
 * Instruction Scheduling:
 *
 * A recursive depth-based scheduling algo.  Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions).  Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would be no longer valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */

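/*
 * Illustrative sketch (added commentary, not from the original source):
 * with two a0.x writes A1 and A2, each followed by users U1a/U1b and
 * U2a/U2b, a legal order is
 *
 *    A1, U1a, U1b, A2, U2a, U2b
 *
 * whereas interleaving (e.g. A1, A2, U1a, ..) would leave U1a reading
 * A2's value, since there is only a single address register.
 */
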
struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	int live_values;                   /* estimate of current live values */
	bool error;
};

static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

static void
unuse_each_src(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
			unuse_each_src(ctx, src);
		} else {
			debug_assert(src->use_count > 0);

			if (--src->use_count == 0) {
				ctx->live_values -= dest_regs(src);
				debug_assert(ctx->live_values >= 0);
			}
		}
	}
}

static void
use_each_src(struct ir3_instruction *instr)
{
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, n, instr) {
		if (__is_false_dep(instr, n))
			continue;
		if (instr->block != src->block)
			continue;
		if ((src->opc == OPC_META_FI) || (src->opc == OPC_META_FO)) {
			use_each_src(src);
		} else {
			src->use_count++;
		}
	}
}

static void
update_live_values(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
		return;

	ctx->live_values += dest_regs(instr);
	unuse_each_src(ctx, instr);
}

/* This is *slightly* different than how ir3_cp uses use_count, in that
 * we just track it per block (because we schedule a block at a time) and
 * because we don't track meta instructions and false dependencies (since
 * they don't contribute real register pressure).
 */
static void
update_use_count(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		instr->use_count = 0;
	}

	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if ((instr->opc == OPC_META_FI) || (instr->opc == OPC_META_FO))
			continue;

		use_each_src(instr);
	}
}
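
/*
 * Rough accounting sketch (added commentary, not from the original
 * source): when an instruction is scheduled, live_values grows by
 * dest_regs(instr) for the value it defines, and shrinks by
 * dest_regs(src) for every src whose last remaining use this was
 * (use_count dropping to zero).  E.g. scheduling a single-component
 * add whose two scalar srcs both die here nets live_values += 1 - 1 - 1.
 */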

#define NULL_INSTR ((void *)~0)

static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	update_live_values(ctx, instr);

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}
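
/*
 * Added note (hedged): the back-to-back sfu/mem check above means that
 * scheduling a memory access immediately after an sfu op (or vice versa)
 * comes out as "sfu; nop; mem" in the final instruction stream, rather
 * than the two issuing in adjacent slots.
 */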

static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->sun > d->sun))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

/**
 * @block: the block to search in, starting from end; in the first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions).  For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned maxd, bool pred)
{
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
		if ((n == instr) || (d >= maxd))
			return d;
		/* NOTE: don't count branch/jump since we don't know yet if they will
		 * be eliminated later in resolve_jumps().. really should do that
		 * earlier so we don't have this constraint.
		 */
		if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
			d++;
	}

	/* if coming from a predecessor block, assume it is assigned far
	 * enough away.. we'll fix up later.
	 */
	if (!pred)
		return maxd;

	if (pred && (block->data != block)) {
		/* Search into predecessor blocks, finding the one with the
		 * shortest distance, since that will be the worst case
		 */
		unsigned min = maxd - d;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		for (unsigned i = 0; i < block->predecessors_count; i++) {
			unsigned n;

			n = distance(block->predecessors[i], instr, min, pred);

			min = MIN2(min, n);
		}

		block->data = NULL;
		d += min;
	}

	return d;
}
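
/*
 * Added example (hedged): if the tail of the already-scheduled block
 * currently reads
 *
 *    ..., assigner, add, mul       <- end of instr_list
 *
 * then distance(block, assigner, maxd, false) returns 2, since two
 * slot-consuming instructions (the add and the mul) have been issued
 * after the assigner; branches/jumps are not counted.
 */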

/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer,
		unsigned srcn, bool soft, bool pred)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
			delay = MAX2(delay, d);
		}
	} else {
		if (soft) {
			if (is_sfu(assigner)) {
				delay = 4;
			} else {
				delay = ir3_delayslots(assigner, consumer, srcn);
			}
		} else {
			delay = ir3_delayslots(assigner, consumer, srcn);
		}
		delay -= distance(block, assigner, delay, pred);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		d = delay_calc_srcn(block, src, instr, i, soft, pred);
		delay = MAX2(delay, d);
	}

	return delay;
}
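
/*
 * Added worked example (hedged): if ir3_delayslots() reports that the
 * consumer needs N slots after the assigner, and distance() shows that
 * d slot-consuming instructions have already been issued since the
 * assigner, the remaining delay is N - d.  distance() is capped at the
 * requested delay, so the subtraction above cannot wrap below zero.
 */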

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any conditional block
	 * which contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the best instruction to schedule from specified instruction or,
 * recursively, its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's.  Otherwise the recursive algo can scale quite
	 * badly w/ shader size.  But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}
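
/*
 * Added note (hedged): instr->data acts as a three-state cache here:
 * NULL means "not visited yet", NULL_INSTR means "nothing schedulable
 * was found below this instruction", and any other value is the cached
 * candidate.  clear_cache() resets entries back to NULL once scheduling
 * an instruction may have invalidated them.
 */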

/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		bool soft)
{
	struct ir3_instruction *best_instr = NULL;
	unsigned min_delay = ~0;

	/* TODO we'd really rather use the list/array of block outputs.  But we
	 * don't have such a thing.  Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;
		unsigned delay;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		if (ctx->live_values > 16*4) {
			/* under register pressure, only care about reducing live values: */
			if (!best_instr || (candidate->sun > best_instr->sun))
				best_instr = candidate;
		} else {
			delay = delay_calc(ctx->block, candidate, soft, false);
			if ((delay < min_delay) ||
					((delay <= (min_delay + 2)) && (candidate->sun > best_instr->sun))) {
				best_instr = candidate;
				min_delay = delay;
			}
		}
	}

	return best_instr;
}
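
/*
 * Added note (hedged): the 16*4 threshold above is an estimate of live
 * scalar components (roughly sixteen full vec4's worth).  Below it, the
 * scheduler mostly minimizes stall delay, only preferring a higher ->sun
 * candidate when its delay is within two slots of the best seen so far;
 * above it, delay is ignored and the largest ->sun candidate wins.
 */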

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = ir3_instr_clone(ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}
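
/*
 * Added sketch (hedged): continuing the earlier a0.x example, if A1 and
 * U1a are already scheduled but U1b is still pending when A2 must be
 * scheduled, split_addr() clones A1 into an unscheduled A1' and repoints
 * U1b at it, so A2 no longer conflicts and U1b later picks up a0.x from
 * A1' instead.
 */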

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = ir3_instr_clone(ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if (instr->opc == OPC_META_INPUT) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}

	while (!list_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes, true);
		if (!instr)
			instr = find_eligible_instr(ctx, &notes, false);

		if (instr) {
			unsigned delay = delay_calc(ctx->block, instr, false, false);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block.  Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx->block, ctx->pred, delay, false);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch.  Throw in an assert so we notice if this
		 * ever breaks on future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}
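
/*
 * Added note (hedged): for the conditional-branch case above, the nop
 * padding is effectively 6 - distance(block, pred_write, 6) slots, i.e.
 * if the p0.x write was the very last instruction scheduled, all six
 * nops are emitted; any slot-consuming instructions already sitting
 * between the p0.x write and the end of the block reduce the count.
 */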

/* After scheduling individual blocks, we still could have cases where,
 * along one (or more) paths into a block, a value produced by a previous
 * block has too few delay slots to be legal.  We can't deal with this in
 * the first pass, because of loops (ie. we can't ensure all predecessor
 * blocks are already scheduled in the first pass).  All we can really do
 * at this point is stuff in extra nop's until things are legal.
 */
static void
sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	unsigned n = 0;

	ctx->block = block;

	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
		unsigned delay = 0;

		for (unsigned i = 0; i < block->predecessors_count; i++) {
			unsigned d = delay_calc(block->predecessors[i], instr, false, true);
			delay = MAX2(d, delay);
		}

		while (delay > n) {
			struct ir3_instruction *nop = ir3_NOP(block);

			/* move to before instr: */
			list_delinit(&nop->node);
			list_addtail(&nop->node, &instr->node);

			n++;
		}

		/* we can bail once we hit worst case delay: */
		if (++n > 6)
			break;
	}
}
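
/*
 * Added note (hedged): 'n' above counts instruction slots from the top
 * of the block (including the nops just inserted), so a cross-block
 * hazard only forces nops while delay > n; once we are more than six
 * slots into the block, no producer in a predecessor block can still be
 * within its delay window and the loop bails out.
 */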

int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};

	ir3_clear_mark(ir);

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		ctx.live_values = 0;
		update_use_count(block);
		sched_block(&ctx, block);
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_intra_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}

/* does instruction 'prior' need to be scheduled before 'instr'? */
static bool
depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
{
	/* TODO for dependencies that are related to a specific object, ie
	 * a specific SSBO/image/array, we could relax this constraint to
	 * make accesses to unrelated objects not depend on each other (at
	 * least as long as not declared coherent)
	 */
	if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
			((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
		return true;
	return !!(instr->barrier_class & prior->barrier_conflict);
}

static void
add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
{
	struct list_head *prev = instr->node.prev;
	struct list_head *next = instr->node.next;

	/* add dependencies on previous instructions that must be scheduled
	 * prior to the current instruction
	 */
	while (prev != &block->instr_list) {
		struct ir3_instruction *pi =
			LIST_ENTRY(struct ir3_instruction, prev, node);

		prev = prev->prev;

		if (is_meta(pi))
			continue;

		if (instr->barrier_class == pi->barrier_class) {
			ir3_instr_add_dep(instr, pi);
			break;
		}

		if (depends_on(instr, pi))
			ir3_instr_add_dep(instr, pi);
	}

	/* add dependencies on this instruction to following instructions
	 * that must be scheduled after the current instruction:
	 */
	while (next != &block->instr_list) {
		struct ir3_instruction *ni =
			LIST_ENTRY(struct ir3_instruction, next, node);

		next = next->next;

		if (is_meta(ni))
			continue;

		if (instr->barrier_class == ni->barrier_class) {
			ir3_instr_add_dep(ni, instr);
			break;
		}

		if (depends_on(ni, instr))
			ir3_instr_add_dep(ni, instr);
	}
}
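
/*
 * Added example (hedged): if a memory write appears after a read whose
 * barrier_class conflicts with it, add_barrier_deps() records a false
 * dependency from the write back to the read, so the scheduler cannot
 * reorder the write ahead of the read even though there is no ssa link
 * between them.  Scanning in each direction stops early once an
 * instruction of the same barrier_class is found, since that one already
 * carries the dependency transitively.
 */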

/* before scheduling a block, we need to add any necessary false-dependencies
 * to ensure that:
 *
 *  (1) barriers are scheduled in the right order wrt instructions related
 *      to the barrier
 *
 *  (2) reads that come before a write actually get scheduled before the
 *      write
 */
static void
calculate_deps(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->barrier_class) {
			add_barrier_deps(block, instr);
		}
	}
}

void
ir3_sched_add_deps(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		calculate_deps(block);
	}
}