src/gallium/drivers/freedreno/ir3/ir3_sched.c
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A priority-queue based scheduling algo.  Add eligible instructions,
 * ie. ones with all their dependencies scheduled, to the priority
 * (depth) sorted queue (list).  Pop highest priority instruction off
 * the queue and schedule it, add newly eligible instructions to the
 * priority queue, rinse, repeat.
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
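/* A rough sketch of the a0.x constraint (pseudo-instructions, for
 * illustration only):
 *
 *    (write a0.x)   <- first writer
 *    (use a0.x)     <- e.g. an indirect register access
 *    (write a0.x)   <- must not be scheduled before the first user,
 *    (use a0.x)        or it would clobber a0.x under it
 *
 * So once a write to a0.x is scheduled, all of its users must be
 * scheduled before any other a0.x write is allowed in.  The same
 * reasoning applies to p0.x.
 */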

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	bool error;
};

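/* Helper for the nop workaround in schedule(): back-to-back sfu/mem
 * instructions appear to need a nop between them (see the comment in
 * schedule() below); this predicate just groups the instructions that
 * the workaround applies to.
 */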
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;
}

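/* Count how many alu/flow instructions have been emitted since 'instr'
 * was scheduled (capped at maxd).  This is used to credit cycles that
 * have already elapsed against a consumer's delay-slot requirement, so
 * we only pad with the nops that are still needed.
 */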
static unsigned
distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
		unsigned maxd)
{
	struct list_head *instr_list = &ctx->block->instr_list;
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
		if ((n == instr) || (d >= maxd))
			break;
		if (is_alu(n) || is_flow(n))
			d++;
	}

	return d;
}

/* calculate delay for specified src: */
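/* Note on the meta case below: meta instructions don't map to real hw
 * instructions, so they have no delay of their own; instead we recurse
 * into the meta instruction's sources and take the worst case.  For a
 * real assigner, the slots required by ir3_delayslots() are reduced by
 * however many instructions have already issued since it was scheduled
 * (see distance() above).
 */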
static unsigned
delay_calc_srcn(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned srcn)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			if (src->block != assigner->block)
				break;
			d = delay_calc_srcn(ctx, src, consumer, srcn);
			delay = MAX2(delay, d);
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn);
		delay -= distance(ctx, assigner, delay);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		if (src->block != instr->block)
			continue;
		d = delay_calc_srcn(ctx, src, instr, i);
		delay = MAX2(delay, d);
	}

	return delay;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

static bool
check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return true;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return true;
	}

	return false;
}

/* Is this instruction ready to be scheduled?  Returns negative for not
 * ready (updating notes if needed), or >= 0 to indicate the number of
 * delay slots needed.
 */
static int
instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	unsigned delay = 0;

	/* Phi instructions can have a dependency on something not
	 * scheduled yet (for ex, loops).  But OTOH we don't really
	 * care.  By definition phi's should appear at the top of
	 * the block, and their sources should be values from the
	 * previously executing block, so they are always ready to
	 * be scheduled:
	 */
	if (is_meta(instr) && (instr->opc == OPC_META_PHI))
		return 0;

	foreach_ssa_src(src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if (!is_scheduled(src))
			return -1;
	}

	/* all our dependencies are scheduled, figure out if
	 * we have enough delay slots to schedule ourself:
	 */
	delay = delay_calc(ctx, instr);
	if (delay)
		return delay;

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->depth == DEPTH_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return -1;
			}
		}
	}

	if (check_conflict(ctx, notes, instr))
		return -1;

	return 0;
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* move eligible instructions to the priority list: */
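/* Returns the minimum outstanding delay (in slots) over all otherwise
 * ready instructions (0 if at least one was moved onto the priority
 * queue), or ~0 if nothing is ready at all, which sched_block() treats
 * as a potential addr/pred deadlock.
 */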
static unsigned
add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct list_head *prio_queue, struct list_head *unscheduled_list)
{
	unsigned min_delay = ~0;

	list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) {
		int e = instr_eligibility(ctx, notes, instr);
		if (e < 0)
			continue;

		/* For instructions that write the address register, we need
		 * to make sure there is at least one otherwise-ready
		 * instruction that uses the addr value.
		 *
		 * TODO if any instructions use pred register and have other
		 * src args, we would need to do the same for writes_pred()..
		 */
		if (unlikely(writes_addr(instr))) {
			struct ir3 *ir = instr->block->shader;
			bool ready = false;
			for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
				struct ir3_instruction *indirect = ir->indirects[i];
				if (!indirect)
					continue;
				if (indirect->address != instr)
					continue;
				ready = could_sched(indirect, instr);
			}

			/* nothing could be scheduled, so keep looking: */
			if (!ready)
				continue;
		}

		min_delay = MIN2(min_delay, e);
		if (e == 0) {
			/* remove from unscheduled list and into priority queue: */
			list_delinit(&instr->node);
			ir3_insert_by_depth(instr, prio_queue);
		}
	}

	return min_delay;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
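/* Note that the clone is handed back to sched_block(), which puts it on
 * the unscheduled list, so the remapped users (and the clone itself) go
 * through the normal scheduling path once the a0.x conflict is gone.
 */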
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = ir3_instr_clone(ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = ir3_instr_clone(ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

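/* Schedule a single basic block: the block's instructions are moved onto
 * a local unscheduled list, then pulled back into the (now empty)
 * instruction list one at a time, in priority (depth) order, as they
 * become eligible.  Finally the branch/jump to the successor block(s)
 * is appended.
 */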
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list, prio_queue;

	ctx->block = block;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting):
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&prio_queue);

	/* first a pre-pass to schedule all meta:input/phi instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied):
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
				(instr->opc == OPC_META_PHI)))
			schedule(ctx, instr);
	}

	while (!(list_empty(&unscheduled_list) &&
			list_empty(&prio_queue))) {
		struct ir3_sched_notes notes = {0};
		unsigned delay;

		delay = add_eligible_instrs(ctx, &notes, &prio_queue, &unscheduled_list);

		if (!list_empty(&prio_queue)) {
			struct ir3_instruction *instr = list_last_entry(&prio_queue,
					struct ir3_instruction, node);
			/* ugg, this is a bit ugly, but between the time when
			 * the instruction became eligible and now, a new
			 * conflict may have arisen..
			 */
			if (check_conflict(ctx, &notes, instr)) {
				list_del(&instr->node);
				list_addtail(&instr->node, &unscheduled_list);
				continue;
			}

			schedule(ctx, instr);
		} else if (delay == ~0) {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				list_del(&new_instr->node);
				list_addtail(&new_instr->node, &unscheduled_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}

		} else {
			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block.  Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx, ctx->pred, delay);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch.  Throw in an assert so we notice if this
		 * ever breaks on a future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}

/* this is needed to ensure later RA stage succeeds: */
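/* (Roughly: for each phi source we insert a mov in the source value's
 * block, with its destination flagged as IR3_REG_PHI_SRC and pointed
 * back at the phi, so that RA can see the copy and assign the phi and
 * its sources compatible registers.)
 */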
static void
sched_insert_parallel_copies(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (is_meta(instr) && (instr->opc == OPC_META_PHI)) {
			struct ir3_register *reg;
			foreach_src(reg, instr) {
				struct ir3_instruction *src = reg->instr;
				struct ir3_instruction *mov =
					ir3_MOV(src->block, src, TYPE_U32);
				mov->regs[0]->flags |= IR3_REG_PHI_SRC;
				mov->regs[0]->instr = instr;
				reg->instr = mov;
			}
		}
	}
}

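/* Main entry point for the scheduling pass: insert the parallel copies
 * for phi sources, clear the MARK flags (which double as the
 * "is scheduled" bit), and then schedule each block in turn.
 */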
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_insert_parallel_copies(block);
	}
	ir3_clear_mark(ir);
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_block(&ctx, block);
	}
	if (ctx.error)
		return -1;
	return 0;
}