freedreno/ir3: sched fixes for addr register usage
[mesa.git] src/gallium/drivers/freedreno/ir3/ir3_sched.c
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A priority-queue based scheduling algo. Add eligible instructions,
 * ie. ones with all their dependencies scheduled, to the priority
 * (depth) sorted queue (list). Pop highest priority instruction off
 * the queue and schedule it, add newly eligible instructions to the
 * priority queue, rinse, repeat.
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation. Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
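
/*
 * For example (schematically), with two a0.x write/use pairs:
 *
 *     write a0.x  (A)
 *     write a0.x  (B)     <-- clobbers A's value before its use
 *     use a0.x    (A)
 *     use a0.x    (B)
 *
 * is not a legal schedule; all of pair A must be emitted before pair
 * B's write (or vice versa).
 */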

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	bool error;
};

static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

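/* mark the instruction as scheduled and append it to the end of the
 * block's instruction list, tracking it as the current a0.x/p0.x
 * writer if it writes one of those registers:
 */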
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;
}

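/* count how many alu/flow instructions have been scheduled after
 * 'instr' (capped at maxd), ie. how many of its delay slots are
 * already covered:
 */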
static unsigned
distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
		unsigned maxd)
{
	struct list_head *instr_list = &ctx->block->instr_list;
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
		if ((n == instr) || (d >= maxd))
			break;
		if (is_alu(n) || is_flow(n))
			d++;
	}

	return d;
}

/* calculate delay for specified src: */
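/* (for meta instructions, which don't map to a real hw instruction,
 * the delay is taken as the max over their own srcs)
 */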
static unsigned
delay_calc_srcn(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned srcn)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			if (src->block != assigner->block)
				break;
			d = delay_calc_srcn(ctx, src, consumer, srcn);
			delay = MAX2(delay, d);
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn);
		delay -= distance(ctx, assigner, delay);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		if (src->block != instr->block)
			continue;
		d = delay_calc_srcn(ctx, src, instr, i);
		delay = MAX2(delay, d);
	}

	return delay;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

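/* the IR3_INSTR_MARK flag (cleared via ir3_clear_mark() before
 * scheduling) is used to track which instructions have already been
 * scheduled:
 */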
static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

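/* returns true (and records which kind of conflict in notes) if instr
 * writes the address or predicate register while another writer's
 * value is still being tracked as current:
 */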
static bool
check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return true;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return true;
	}

	return false;
}

/* is this instruction ready to be scheduled? Return negative for not
 * ready (updating notes if needed), or >= 0 to indicate number of
 * delay slots needed.
 */
static int
instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	unsigned delay = 0;

	/* Phi instructions can have a dependency on something not
	 * scheduled yet (for ex, loops). But OTOH we don't really
	 * care. By definition phi's should appear at the top of
	 * the block, and their sources should be values from the
	 * previously executing block, so they are always ready to
	 * be scheduled:
	 */
	if (is_meta(instr) && (instr->opc == OPC_META_PHI))
		return 0;

	foreach_ssa_src(src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if (!is_scheduled(src))
			return -1;
	}

	/* all our dependencies are scheduled, figure out if
	 * we have enough delay slots to schedule ourselves:
	 */
	delay = delay_calc(ctx, instr);
	if (delay)
		return delay;

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled. The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as a
	 * virtual ssa src for the kill instruction. But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional. We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->depth == DEPTH_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return -1;
			}
		}
	}

	if (check_conflict(ctx, notes, instr))
		return -1;

	return 0;
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* move eligible instructions to the priority list: */
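/* (returns the minimum number of delay slots needed by any instruction
 * whose dependencies are all scheduled, or ~0 if there is no such
 * instruction)
 */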
static unsigned
add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct list_head *prio_queue, struct list_head *unscheduled_list)
{
	unsigned min_delay = ~0;

	list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) {
		int e = instr_eligibility(ctx, notes, instr);
		if (e < 0)
			continue;

		/* For instructions that write the address register, we need
		 * to make sure there is at least one instruction that uses
		 * the addr value and is otherwise ready to be scheduled.
		 *
		 * TODO if any instructions use the pred register and have
		 * other src args, we would need to do the same for
		 * writes_pred()..
		 */
		if (unlikely(writes_addr(instr))) {
			struct ir3 *ir = instr->block->shader;
			bool ready = false;
			for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
				struct ir3_instruction *indirect = ir->indirects[i];
				if (indirect->address != instr)
					continue;
				ready = could_sched(indirect, instr);
			}

			/* nothing could be scheduled, so keep looking: */
			if (!ready)
				continue;
		}

		min_delay = MIN2(min_delay, e);
		if (e == 0) {
			/* remove from unscheduled list and insert into priority queue: */
			list_delinit(&instr->node);
			ir3_insert_by_depth(instr, prio_queue);
		}
	}

	return min_delay;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = ir3_instr_clone(ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = ir3_instr_clone(ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

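/* schedule a single block: eligible instructions are moved from the
 * unscheduled list onto a depth-sorted priority queue, and the highest
 * priority instruction is popped and scheduled until both lists are
 * empty:
 */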
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list, prio_queue;

	ctx->block = block;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&prio_queue);

	/* first a pre-pass to schedule all meta:input/phi instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied):
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
				(instr->opc == OPC_META_PHI)))
			schedule(ctx, instr);
	}

	while (!(list_empty(&unscheduled_list) &&
			list_empty(&prio_queue))) {
		struct ir3_sched_notes notes = {0};
		unsigned delay;

		delay = add_eligible_instrs(ctx, &notes, &prio_queue, &unscheduled_list);

		if (!list_empty(&prio_queue)) {
			struct ir3_instruction *instr = list_last_entry(&prio_queue,
					struct ir3_instruction, node);
			/* ugg, this is a bit ugly, but between the time when
			 * the instruction became eligible and now, a new
			 * conflict may have arisen..
			 */
			if (check_conflict(ctx, &notes, instr)) {
				list_del(&instr->node);
				list_addtail(&instr->node, &unscheduled_list);
				continue;
			}

			schedule(ctx, instr);
		} else if (delay == ~0) {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				list_del(&new_instr->node);
				list_addtail(&new_instr->node, &unscheduled_list);
			}

		} else {
			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block. Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx, ctx->pred, delay);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch. Throw in an assert so we notice if this
		 * ever breaks on future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate the (jp) flag..
	 */
}

/* this is needed to ensure later RA stage succeeds: */
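/* (for each phi src, a mov is inserted in the src's block and the phi
 * src is re-pointed at that mov; the mov's dst is flagged
 * IR3_REG_PHI_SRC and linked back to the phi instruction)
 */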
static void
sched_insert_parallel_copies(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (is_meta(instr) && (instr->opc == OPC_META_PHI)) {
			struct ir3_register *reg;
			foreach_src(reg, instr) {
				struct ir3_instruction *src = reg->instr;
				struct ir3_instruction *mov =
					ir3_MOV(src->block, src, TYPE_U32);
				mov->regs[0]->flags |= IR3_REG_PHI_SRC;
				mov->regs[0]->instr = instr;
				reg->instr = mov;
			}
		}
	}
}

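/* main entry point: insert parallel copies for phi srcs, clear the
 * MARK flags, then schedule each block in turn. Returns -1 if a block
 * could not be scheduled, 0 otherwise:
 */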
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_insert_parallel_copies(block);
	}
	ir3_clear_mark(ir);
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_block(&ctx, block);
	}
	if (ctx.error)
		return -1;
	return 0;
}