/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A recursive depth based scheduling algo. Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions). Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would be no longer valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation. Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized. I.e.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	bool error;
};

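/* sfu and mem instructions are grouped together here because two such
 * instructions scheduled back-to-back get a nop stuffed between them
 * in schedule() below:
 */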
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

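/* sentinel stored in instr->data to cache "no eligible instruction found
 * below this point", as opposed to NULL which means "not cached yet":
 */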
#define NULL_INSTR ((void *)~0)

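/* clear cached results (instr->data) that may no longer be valid.. if
 * 'instr' is NULL the entire cache is invalidated, otherwise only the
 * entries which cached 'instr' (or the NULL_INSTR sentinel) are cleared:
 */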
static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

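/* remove 'instr' from the depth list and append it to the block's list of
 * scheduled instructions, tracking any a0.x/p0.x write and invalidating the
 * find_instr_recursive() cache entries that are no longer valid:
 */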
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}

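/* return the src with the greatest depth from the 'srcs' array (NULL'ing
 * out its slot so it isn't returned again), or NULL if no srcs remain:
 */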
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

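/* count how many alu/flow instructions have already been emitted after
 * 'instr' in the current block (capped at maxd), ie. delay slots that are
 * effectively already filled:
 */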
static unsigned
distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
		unsigned maxd)
{
	struct list_head *instr_list = &ctx->block->instr_list;
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
		if ((n == instr) || (d >= maxd))
			break;
		if (is_alu(n) || is_flow(n))
			d++;
	}

	return d;
}

/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned srcn)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			if (src->block != assigner->block)
				break;
			d = delay_calc_srcn(ctx, src, consumer, srcn);
			delay = MAX2(delay, d);
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn);
		delay -= distance(ctx, assigner, delay);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		/* for array writes, no need to delay on previous write: */
		if (i == 0)
			continue;
		if (src->block != instr->block)
			continue;
		d = delay_calc_srcn(ctx, src, instr, i);
		delay = MAX2(delay, d);
	}

	return delay;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule. Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled. The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction. But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional. We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the best instruction to schedule from specified instruction or
 * recursively its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's. Otherwise the recursive algo can scale quite
	 * badly w/ shader size. But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}

/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
{
	struct ir3_instruction *best_instr = NULL;
	unsigned min_delay = ~0;

	/* TODO we'd really rather use the list/array of block outputs. But we
	 * don't have such a thing. Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;
		unsigned delay;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		delay = delay_calc(ctx, candidate);
		if (delay < min_delay) {
			best_instr = candidate;
			min_delay = delay;
		}

		if (min_delay == 0)
			break;
	}

	return best_instr;
}

402 /* "spill" the address register by remapping any unscheduled
403 * instructions which depend on the current address register
404 * to a clone of the instruction which wrote the address reg.
405 */
406 static struct ir3_instruction *
407 split_addr(struct ir3_sched_ctx *ctx)
408 {
409 struct ir3 *ir;
410 struct ir3_instruction *new_addr = NULL;
411 unsigned i;
412
413 debug_assert(ctx->addr);
414
415 ir = ctx->addr->block->shader;
416
417 for (i = 0; i < ir->indirects_count; i++) {
418 struct ir3_instruction *indirect = ir->indirects[i];
419
420 if (!indirect)
421 continue;
422
423 /* skip instructions already scheduled: */
424 if (is_scheduled(indirect))
425 continue;
426
427 /* remap remaining instructions using current addr
428 * to new addr:
429 */
430 if (indirect->address == ctx->addr) {
431 if (!new_addr) {
432 new_addr = ir3_instr_clone(ctx->addr);
433 /* original addr is scheduled, but new one isn't: */
434 new_addr->flags &= ~IR3_INSTR_MARK;
435 }
436 ir3_instr_set_address(indirect, new_addr);
437 }
438 }
439
440 /* all remaining indirects remapped to new addr: */
441 ctx->addr = NULL;
442
443 return new_addr;
444 }
445
446 /* "spill" the predicate register by remapping any unscheduled
447 * instructions which depend on the current predicate register
448 * to a clone of the instruction which wrote the address reg.
449 */
450 static struct ir3_instruction *
451 split_pred(struct ir3_sched_ctx *ctx)
452 {
453 struct ir3 *ir;
454 struct ir3_instruction *new_pred = NULL;
455 unsigned i;
456
457 debug_assert(ctx->pred);
458
459 ir = ctx->pred->block->shader;
460
461 for (i = 0; i < ir->predicates_count; i++) {
462 struct ir3_instruction *predicated = ir->predicates[i];
463
464 /* skip instructions already scheduled: */
465 if (is_scheduled(predicated))
466 continue;
467
468 /* remap remaining instructions using current pred
469 * to new pred:
470 *
471 * TODO is there ever a case when pred isn't first
472 * (and only) src?
473 */
474 if (ssa(predicated->regs[1]) == ctx->pred) {
475 if (!new_pred) {
476 new_pred = ir3_instr_clone(ctx->pred);
477 /* original pred is scheduled, but new one isn't: */
478 new_pred->flags &= ~IR3_INSTR_MARK;
479 }
480 predicated->regs[1]->instr = new_pred;
481 }
482 }
483
484 /* all remaining predicated remapped to new pred: */
485 ctx->pred = NULL;
486
487 return new_pred;
488 }
489
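/* schedule a single basic block: pre-schedule meta:input/phi instructions,
 * then repeatedly pick the eligible instruction with the least delay
 * (inserting nop's as needed), and finally emit the branch/jump to the
 * successor block(s):
 */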
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input/phi instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if ((instr->opc == OPC_META_INPUT) || (instr->opc == OPC_META_PHI)) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}

	while (!list_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes);

		if (instr) {
			unsigned delay = delay_calc(ctx, instr);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block. Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx, ctx->pred, delay);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch. Throw in an assert so we notice if this
		 * ever breaks on future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}

/* this is needed to ensure later RA stage succeeds: */
static void
sched_insert_parallel_copies(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->opc == OPC_META_PHI) {
			struct ir3_register *reg, *reg2;
			foreach_src(reg, instr) {
				struct ir3_instruction *src = reg->instr;
				struct ir3_instruction *mov = NULL;

				/* after CP we could end up w/ duplicate phi srcs: */
				foreach_src(reg2, instr) {
					if (reg == reg2)
						break;
					/* reg2 comes before reg, so it already has an inserted mov: */
					else if (reg2->instr->regs[1]->instr == src) {
						mov = reg2->instr;
						break;
					}
				}

				if (!mov) {
					mov = ir3_MOV(src->block, src, TYPE_U32);
					mov->regs[0]->flags |= IR3_REG_PHI_SRC;
					mov->regs[0]->instr = instr;
				}

				reg->instr = mov;
			}
		}
	}
}

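/* main entry point: insert parallel-copy movs for phi srcs, then schedule
 * each block in turn. Returns zero on success, -1 on error:
 */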
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_insert_parallel_copies(block);
	}
	ir3_clear_mark(ir);
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_block(&ctx, block);
	}
	if (ctx.error)
		return -1;
	return 0;
}