freedreno/ir3/sched: try to avoid syncs
[mesa.git] / src / freedreno / ir3 / ir3_sched.c
1 /*
2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Rob Clark <robclark@freedesktop.org>
25 */
26
27
28 #include "util/dag.h"
29 #include "util/u_math.h"
30
31 #include "ir3.h"
32 #include "ir3_compiler.h"
33
34 #ifdef DEBUG
35 #define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
36 #else
37 #define SCHED_DEBUG 0
38 #endif
39 #define d(fmt, ...) do { if (SCHED_DEBUG) { \
40 printf("SCHED: "fmt"\n", ##__VA_ARGS__); \
41 } } while (0)
42
43 #define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
44 printf("SCHED: "fmt": ", ##__VA_ARGS__); \
45 ir3_print_instr(instr); \
46 } } while (0)
47
48 /*
49 * Instruction Scheduling:
50 *
51 * A block-level pre-RA scheduler, which works by creating a DAG of
52 * instruction dependencies, and heuristically picking a DAG head
53 * (instruction with no unscheduled dependencies).
54 *
55 * Where possible, it tries to pick instructions that avoid nop delay
56 * slots, but it will prefer to pick instructions that reduce (or do
57 * not increase) the number of live values.
58 *
59 * If the only possible choices are instructions that increase the
60 * number of live values, it will try to pick the one with the earliest
61 * consumer (based on pre-sched program order).
62 *
63 * There are a few special cases that need to be handled, since sched
64 * is currently independent of register allocation. Usages of address
65 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
66 * if you have two pairs of instructions that write the same special
67 * register and then read it, then those pairs cannot be interleaved.
68 * To solve this, when we are in such a scheduling "critical section",
69 * and we encounter a conflicting write to a special register, we try
70 * to schedule any remaining instructions that use that value first.
71 *
72 * TODO we can detect too-large live_values here.. would be a good place
73 * to "spill" cheap things, like move from uniform/immed. (Constructing
74 * list of ssa def consumers before sched pass would make this easier.
75 * Also, in general it might be best not to re-use load_immed
76 * across blocks.)
77 *
78 * TODO we can use (abs)/(neg) src modifiers in a lot of cases to reduce
79 * the # of immediates in play (or at least that would help with
80 * dEQP-GLES31.functional.ubo.random.all_per_block_buffers.*).. probably
81 * do this in a nir pass that inserts fneg/etc? The cp pass should fold
82 * these into src modifiers..
83 */
84
85 struct ir3_sched_ctx {
86 struct ir3_block *block; /* the current block */
87 struct dag *dag;
88
89 struct list_head unscheduled_list; /* unscheduled instructions */
90 struct ir3_instruction *scheduled; /* last scheduled instr */
91 struct ir3_instruction *addr0; /* current a0.x user, if any */
92 struct ir3_instruction *addr1; /* current a1.x user, if any */
93 struct ir3_instruction *pred; /* current p0.x user, if any */
94
95 int remaining_kills;
96 int remaining_tex;
97
98 bool error;
99
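	/* counters that count down from the most recently scheduled SFU / tex
	 * instruction; while non-zero, would_sync() tries to avoid picking an
	 * instruction that would consume that result (and hence stall on an
	 * (ss)/(sy) sync too early):
	 */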
100 int sfu_delay;
101 int tex_delay;
102 };
103
104 struct ir3_sched_node {
105 struct dag_node dag; /* must be first for util_dynarray_foreach */
106 struct ir3_instruction *instr;
107
108 unsigned delay;
109 unsigned max_delay;
110
111 /* For instructions that are a meta:collect src, once we schedule
112 * the first src of the collect, the entire vecN is live (at least
113 * from the PoV of the first RA pass.. the 2nd scalar pass can fill
114 * in some of the gaps, but often not all). So we want to help out
115 * RA, and realize that as soon as we schedule the first collect
116 * src, there is no penalty to schedule the remainder (ie. they
117 * don't make additional values live). In fact we'd prefer to
118 * schedule the rest ASAP to minimize the live range of the vecN.
119 *
120 * For instructions that are the src of a collect, we track the
121 * corresponding collect, and mark them as partially live as soon
122 * as any one of the src's is scheduled.
123 */
124 struct ir3_instruction *collect;
125 bool partially_live;
126
127 /* Is this instruction a direct or indirect dependency for a kill?
128 * If so, we should prioritize it when possible
129 */
130 bool kill_path;
131
132 /* This node represents a shader output. A semi-common pattern in
133 * shaders is something along the lines of:
134 *
135 * fragcolor.w = 1.0
136 *
137 * Which we'd prefer to schedule as late as possible, since it
138 * produces a live value that is never killed/consumed. So detect
139 * outputs up-front, and avoid scheduling them unless they reduce
140 * register pressure (or at least are neutral).
141 */
142 bool output;
143 };
144
145 #define foreach_sched_node(__n, __list) \
146 list_for_each_entry(struct ir3_sched_node, __n, __list, dag.link)
147
148 static void sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr);
149 static void sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i);
150
151 static bool is_scheduled(struct ir3_instruction *instr)
152 {
153 return !!(instr->flags & IR3_INSTR_MARK);
154 }
155
156 static void
157 schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
158 {
159 debug_assert(ctx->block == instr->block);
160
161 /* remove from the unscheduled list:
162 */
163 list_delinit(&instr->node);
164
165 if (writes_addr0(instr)) {
166 debug_assert(ctx->addr0 == NULL);
167 ctx->addr0 = instr;
168 }
169
170 if (writes_addr1(instr)) {
171 debug_assert(ctx->addr1 == NULL);
172 ctx->addr1 = instr;
173 }
174
175 if (writes_pred(instr)) {
176 debug_assert(ctx->pred == NULL);
177 ctx->pred = instr;
178 }
179
180 instr->flags |= IR3_INSTR_MARK;
181
182 di(instr, "schedule");
183
184 list_addtail(&instr->node, &instr->block->instr_list);
185 ctx->scheduled = instr;
186
187 if (is_kill(instr)) {
188 assert(ctx->remaining_kills > 0);
189 ctx->remaining_kills--;
190 }
191
192 struct ir3_sched_node *n = instr->data;
193
194 /* If this instruction is a meta:collect src, mark the remaining
195 * collect srcs as partially live.
196 */
197 if (n->collect) {
198 struct ir3_instruction *src;
199 foreach_ssa_src (src, n->collect) {
200 if (src->block != instr->block)
201 continue;
202 struct ir3_sched_node *sn = src->data;
203 sn->partially_live = true;
204 }
205 }
206
207 dag_prune_head(ctx->dag, &n->dag);
208
209 if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
210 return;
211
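	/* Track roughly how long ago the last SFU was scheduled: a fresh SFU
	 * resets the countdown to 8 slots, scheduling a consumer of an SFU
	 * result means the (ss) sync is being paid now anyway so the countdown
	 * is cleared, and everything else just ticks it down.
	 */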
212 if (is_sfu(instr)) {
213 ctx->sfu_delay = 8;
214 } else if (check_src_cond(instr, is_sfu)) {
215 ctx->sfu_delay = 0;
216 } else if (ctx->sfu_delay > 0) {
217 ctx->sfu_delay--;
218 }
219
220 if (is_tex_or_prefetch(instr)) {
221 /* NOTE that this isn't an attempt to hide texture fetch latency,
222 * but an attempt to hide the cost of switching to another warp.
223 * If we can, we'd like to try to schedule another texture fetch
224 * before scheduling something that would sync.
225 */
226 ctx->tex_delay = 10;
227 assert(ctx->remaining_tex > 0);
228 ctx->remaining_tex--;
229 } else if (check_src_cond(instr, is_tex_or_prefetch)) {
230 ctx->tex_delay = 0;
231 } else if (ctx->tex_delay > 0) {
232 ctx->tex_delay--;
233 }
234 }
235
236 struct ir3_sched_notes {
237 /* there is at least one kill which could be scheduled, except
238 * for unscheduled bary.f's:
239 */
240 bool blocked_kill;
241 /* there is at least one instruction that could be scheduled,
242 * except for conflicting address/predicate register usage:
243 */
244 bool addr0_conflict, addr1_conflict, pred_conflict;
245 };
246
247 /* could an instruction be scheduled if specified ssa src was scheduled? */
248 static bool
249 could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
250 {
251 struct ir3_instruction *other_src;
252 foreach_ssa_src (other_src, instr) {
253 /* if dependency not scheduled, we aren't ready yet: */
254 if ((src != other_src) && !is_scheduled(other_src)) {
255 return false;
256 }
257 }
258 return true;
259 }
260
261 /* Check if instruction is ok to schedule. Make sure it is not blocked
262 * by use of addr/predicate register, etc.
263 */
264 static bool
265 check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
266 struct ir3_instruction *instr)
267 {
268 debug_assert(!is_scheduled(instr));
269
270 if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) {
271 /* avoid texture/memory access if we have unscheduled kills
272 * that could make the expensive operation unnecessary. By
273 * definition, if there are remaining kills, and this instr
274 * is not a dependency of a kill, there are other instructions
275 * that we can choose from.
276 */
277 struct ir3_sched_node *n = instr->data;
278 if (!n->kill_path)
279 return false;
280 }
281
282 /* For instructions that write the address register, we need to
283 * make sure there is at least one otherwise-ready instruction
284 * that uses the addr value.
285 *
286 * NOTE if any instructions use pred register and have other
287 * src args, we would need to do the same for writes_pred()..
288 */
289 if (writes_addr0(instr)) {
290 struct ir3 *ir = instr->block->shader;
291 bool ready = false;
292 for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
293 struct ir3_instruction *indirect = ir->a0_users[i];
294 if (!indirect)
295 continue;
296 if (indirect->address != instr)
297 continue;
298 ready = could_sched(indirect, instr);
299 }
300
301 /* nothing could be scheduled, so keep looking: */
302 if (!ready)
303 return false;
304 }
305
306 if (writes_addr1(instr)) {
307 struct ir3 *ir = instr->block->shader;
308 bool ready = false;
309 for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
310 struct ir3_instruction *indirect = ir->a1_users[i];
311 if (!indirect)
312 continue;
313 if (indirect->address != instr)
314 continue;
315 ready = could_sched(indirect, instr);
316 }
317
318 /* nothing could be scheduled, so keep looking: */
319 if (!ready)
320 return false;
321 }
322
323 /* if this is a write to address/predicate register, and that
324 * register is currently in use, we need to defer until it is
325 * free:
326 */
327 if (writes_addr0(instr) && ctx->addr0) {
328 debug_assert(ctx->addr0 != instr);
329 notes->addr0_conflict = true;
330 return false;
331 }
332
333 if (writes_addr1(instr) && ctx->addr1) {
334 debug_assert(ctx->addr1 != instr);
335 notes->addr1_conflict = true;
336 return false;
337 }
338
339 if (writes_pred(instr) && ctx->pred) {
340 debug_assert(ctx->pred != instr);
341 notes->pred_conflict = true;
342 return false;
343 }
344
345 /* if the instruction is a kill, we need to ensure *every*
346 * bary.f is scheduled. The hw seems unhappy if the thread
347 * gets killed before the end-input (ei) flag is hit.
348 *
349 * We could do this by adding each bary.f instruction as
350 * virtual ssa src for the kill instruction. But we have
351 * fixed length instr->regs[].
352 *
353 * TODO we could probably handle this with false-deps now.
354 */
355 if (is_kill(instr)) {
356 struct ir3 *ir = instr->block->shader;
357
358 for (unsigned i = 0; i < ir->baryfs_count; i++) {
359 struct ir3_instruction *baryf = ir->baryfs[i];
360 if (baryf->flags & IR3_INSTR_UNUSED)
361 continue;
362 if (!is_scheduled(baryf)) {
363 notes->blocked_kill = true;
364 return false;
365 }
366 }
367 }
368
369 return true;
370 }
371
372 /* Find the instr->ip of the closest use of an instruction, in
373 * pre-sched order. This isn't going to be the same as post-sched
374 * order, but it is a reasonable approximation to avoid scheduling
375 * instructions *too* early. This is mostly to prevent bad behavior
376 * in cases where we have a large number of possible instructions
377 * to choose from, to avoid creating too much parallelism (ie. blowing
378 * up register pressure).
379 *
380 * See dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread
381 */
382 static int
383 nearest_use(struct ir3_instruction *instr)
384 {
385 unsigned nearest = ~0;
386 foreach_ssa_use (use, instr)
387 if (!is_scheduled(use))
388 nearest = MIN2(nearest, use->ip);
389
390 /* slight hack.. this heuristic tends to push bary.f's to later
391 * in the shader, closer to their uses. But we actually would
392 * prefer to get these scheduled earlier, to unlock varying
393 * storage for more VS jobs:
394 */
395 if (is_input(instr))
396 nearest /= 2;
397
398 return nearest;
399 }
400
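/* number of still-unscheduled uses of an ssa value: */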
401 static int
402 use_count(struct ir3_instruction *instr)
403 {
404 unsigned cnt = 0;
405 foreach_ssa_use (use, instr)
406 if (!is_scheduled(use))
407 cnt++;
408 return cnt;
409 }
410
411 /* find net change to live values if instruction were scheduled: */
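/* For example: scheduling a single-dest ALU op whose two scalar srcs each
 * have only this one use left makes one new value live and frees two, for
 * a net effect of -1 -- which is what the "freed" passes in
 * choose_instr_dec() look for (live_effect < 0).
 */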
412 static int
413 live_effect(struct ir3_instruction *instr)
414 {
415 struct ir3_sched_node *n = instr->data;
416 struct ir3_instruction *src;
417 int new_live = n->partially_live ? 0 : dest_regs(instr);
418 int freed_live = 0;
419
420 /* if we schedule something that causes a vecN to be live,
421 * then count all its other components too:
422 */
423 if (n->collect)
424 new_live *= n->collect->regs_count - 1;
425
426 foreach_ssa_src_n (src, n, instr) {
427 if (__is_false_dep(instr, n))
428 continue;
429
430 if (instr->block != src->block)
431 continue;
432
433 if (use_count(src) == 1)
434 freed_live += dest_regs(src);
435 }
436
437 return new_live - freed_live;
438 }
439
440 /* Determine if this is an instruction that we'd prefer not to schedule
441 * yet, in order to avoid an (ss)/(sy) sync. This is limited by the
442 * sfu_delay/tex_delay counters, ie. the more cycles it has been since
443 * the last SFU/tex, the less costly a sync would be.
444 */
445 static bool
446 would_sync(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
447 {
448 if (ctx->sfu_delay) {
449 if (check_src_cond(instr, is_sfu))
450 return true;
451 }
452
453 /* We mostly just want to try to schedule another texture fetch
454 * before scheduling something that would (sy) sync, so we can
455 * limit this rule to cases where there are remaining texture
456 * fetches
457 */
458 if (ctx->tex_delay && ctx->remaining_tex) {
459 if (check_src_cond(instr, is_tex_or_prefetch))
460 return true;
461 }
462
463 return false;
464 }
465
466 static struct ir3_sched_node *
467 choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
468 bool avoid_sync, bool avoid_output);
469
470 /**
471 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
472 * Scheduling for Register pressure) heuristic.
473 *
474 * Only handles the case of choosing instructions that reduce register pressure
475 * or are neutral.
476 */
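/* The passes below try, in order: instructions that free registers and are
 * ready (no delay slots), then freeing instructions that aren't ready yet,
 * then pressure-neutral ready instructions, then pressure-neutral ones that
 * aren't ready, falling back to choose_instr_inc() if none of those exist.
 * Within each pass, the candidate with the largest max_delay (longest
 * critical path) wins.
 */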
477 static struct ir3_sched_node *
478 choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
479 bool avoid_sync)
480 {
481 const char *mode = avoid_sync ? "-as" : "";
482 struct ir3_sched_node *chosen = NULL;
483
484 /* Find a ready inst with regs freed and pick the one with max cost. */
485 foreach_sched_node (n, &ctx->dag->heads) {
486 if (avoid_sync && would_sync(ctx, n->instr))
487 continue;
488
489 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
490
491 if (d > 0)
492 continue;
493
494 if (live_effect(n->instr) > -1)
495 continue;
496
497 if (!check_instr(ctx, notes, n->instr))
498 continue;
499
500 if (!chosen || chosen->max_delay < n->max_delay) {
501 chosen = n;
502 }
503 }
504
505 if (chosen) {
506 di(chosen->instr, "dec%s: chose (freed+ready)", mode);
507 return chosen;
508 }
509
510 /* Find a leader with regs freed and pick the one with max cost. */
511 foreach_sched_node (n, &ctx->dag->heads) {
512 if (avoid_sync && would_sync(ctx, n->instr))
513 continue;
514
515 if (live_effect(n->instr) > -1)
516 continue;
517
518 if (!check_instr(ctx, notes, n->instr))
519 continue;
520
521 if (!chosen || chosen->max_delay < n->max_delay) {
522 chosen = n;
523 }
524 }
525
526 if (chosen) {
527 di(chosen->instr, "dec%s: chose (freed)", mode);
528 return chosen;
529 }
530
531 /* Contra the paper, pick a leader with no effect on used regs. This may
532 * open up new opportunities, as otherwise a single-operand instr consuming
533 * a value will tend to block us from freeing that value. This had a
534 * massive effect on reducing spilling on V3D.
535 *
536 * XXX: Should this prioritize ready?
537 */
538 foreach_sched_node (n, &ctx->dag->heads) {
539 if (avoid_sync && would_sync(ctx, n->instr))
540 continue;
541
542 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
543
544 if (d > 0)
545 continue;
546
547 if (live_effect(n->instr) > 0)
548 continue;
549
550 if (!check_instr(ctx, notes, n->instr))
551 continue;
552
553 if (!chosen || chosen->max_delay < n->max_delay)
554 chosen = n;
555 }
556
557 if (chosen) {
558 di(chosen->instr, "dec%s: chose (neutral+ready)", mode);
559 return chosen;
560 }
561
562 foreach_sched_node (n, &ctx->dag->heads) {
563 if (avoid_sync && would_sync(ctx, n->instr))
564 continue;
565
566 if (live_effect(n->instr) > 0)
567 continue;
568
569 if (!check_instr(ctx, notes, n->instr))
570 continue;
571
572 if (!chosen || chosen->max_delay < n->max_delay)
573 chosen = n;
574 }
575
576 if (chosen) {
577 di(chosen->instr, "dec%s: chose (neutral)", mode);
578 return chosen;
579 }
580
581 return choose_instr_inc(ctx, notes, avoid_sync, true);
582 }
583
584 /**
585 * When we can't choose an instruction that reduces register pressure or
586 * is neutral, we end up here to try and pick the least bad option.
587 */
588 static struct ir3_sched_node *
589 choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
590 bool avoid_sync, bool avoid_output)
591 {
592 const char *mode = avoid_sync ? "-as" : "";
593 struct ir3_sched_node *chosen = NULL;
594
595 /*
596 * From here on out, we are picking something that increases
597 * register pressure. So try to pick something which will
598 * be consumed soon:
599 */
600 unsigned chosen_distance = 0;
601
602 /* Among the ready instructions, pick the one whose result is consumed soonest (pre-sched order): */
603 foreach_sched_node (n, &ctx->dag->heads) {
604 if (avoid_output && n->output)
605 continue;
606
607 if (avoid_sync && would_sync(ctx, n->instr))
608 continue;
609
610 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
611
612 if (d > 0)
613 continue;
614
615 if (!check_instr(ctx, notes, n->instr))
616 continue;
617
618 unsigned distance = nearest_use(n->instr);
619
620 if (!chosen || distance < chosen_distance) {
621 chosen = n;
622 chosen_distance = distance;
623 }
624 }
625
626 if (chosen) {
627 di(chosen->instr, "inc%s: chose (distance+ready)", mode);
628 return chosen;
629 }
630
631 /* Same, but consider all remaining leaders, even ones that aren't ready yet: */
632 foreach_sched_node (n, &ctx->dag->heads) {
633 if (avoid_output && n->output)
634 continue;
635
636 if (avoid_sync && would_sync(ctx, n->instr))
637 continue;
638
639 if (!check_instr(ctx, notes, n->instr))
640 continue;
641
642 unsigned distance = nearest_use(n->instr);
643
644 if (!chosen || distance < chosen_distance) {
645 chosen = n;
646 chosen_distance = distance;
647 }
648 }
649
650 if (chosen) {
651 di(chosen->instr, "inc%s: chose (distance)", mode);
652 return chosen;
653 }
654
655 return NULL;
656 }
657
658 /* Handles selection of instructions that we want to prioritize even if
659 * CSP/CSR would not pick them.
660 */
661 static struct ir3_sched_node *
662 choose_instr_prio(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
663 {
664 struct ir3_sched_node *chosen = NULL;
665
666 foreach_sched_node (n, &ctx->dag->heads) {
667 if (!is_meta(n->instr))
668 continue;
669
670 if (!chosen || (chosen->max_delay < n->max_delay))
671 chosen = n;
672 }
673
674 if (chosen) {
675 di(chosen->instr, "prio: chose (meta)");
676 return chosen;
677 }
678
679 return NULL;
680 }
681
682 static void
683 dump_state(struct ir3_sched_ctx *ctx)
684 {
685 if (!SCHED_DEBUG)
686 return;
687
688 foreach_sched_node (n, &ctx->dag->heads) {
689 di(n->instr, "maxdel=%3d le=%d del=%u ",
690 n->max_delay, live_effect(n->instr),
691 ir3_delay_calc(ctx->block, n->instr, false, false));
692
693 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
694 struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
695
696 di(child->instr, " -> (%d parents) ", child->dag.parent_count);
697 }
698 }
699 }
700
701 /* find instruction to schedule: */
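/* The overall priority order: certain meta instructions first, then the
 * register-pressure heuristics with (ss)/(sy) sync avoidance enabled, then
 * the same heuristics allowing syncs, and as a last resort
 * pressure-increasing instructions with no restrictions at all.
 */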
702 static struct ir3_instruction *
703 choose_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
704 {
705 struct ir3_sched_node *chosen;
706
707 dump_state(ctx);
708
709 chosen = choose_instr_prio(ctx, notes);
710 if (chosen)
711 return chosen->instr;
712
713 chosen = choose_instr_dec(ctx, notes, true);
714 if (chosen)
715 return chosen->instr;
716
717 chosen = choose_instr_dec(ctx, notes, false);
718 if (chosen)
719 return chosen->instr;
720
721 chosen = choose_instr_inc(ctx, notes, false, false);
722 if (chosen)
723 return chosen->instr;
724
725 return NULL;
726 }
727
728 static struct ir3_instruction *
729 split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
730 {
731 struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
732 di(new_instr, "split instruction");
733 sched_node_init(ctx, new_instr);
734 return new_instr;
735 }
736
737 /* "spill" the address registers by remapping any unscheduled
738 * instructions which depend on the current address register
739 * to a clone of the instruction which wrote the address reg.
740 */
741 static struct ir3_instruction *
742 split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
743 struct ir3_instruction **users, unsigned users_count)
744 {
745 struct ir3_instruction *new_addr = NULL;
746 unsigned i;
747
748 debug_assert(*addr);
749
750 for (i = 0; i < users_count; i++) {
751 struct ir3_instruction *indirect = users[i];
752
753 if (!indirect)
754 continue;
755
756 /* skip instructions already scheduled: */
757 if (is_scheduled(indirect))
758 continue;
759
760 /* remap remaining instructions using current addr
761 * to new addr:
762 */
763 if (indirect->address == *addr) {
764 if (!new_addr) {
765 new_addr = split_instr(ctx, *addr);
766 /* original addr is scheduled, but new one isn't: */
767 new_addr->flags &= ~IR3_INSTR_MARK;
768 }
769 indirect->address = new_addr;
770 /* don't need to remove old dag edge since old addr is
771 * already scheduled:
772 */
773 sched_node_add_dep(indirect, new_addr, 0);
774 di(indirect, "new address");
775 }
776 }
777
778 /* all remaining indirects remapped to new addr: */
779 *addr = NULL;
780
781 return new_addr;
782 }
783
784 /* "spill" the predicate register by remapping any unscheduled
785 * instructions which depend on the current predicate register
786 * to a clone of the instruction which wrote the predicate reg.
787 */
788 static struct ir3_instruction *
789 split_pred(struct ir3_sched_ctx *ctx)
790 {
791 struct ir3 *ir;
792 struct ir3_instruction *new_pred = NULL;
793 unsigned i;
794
795 debug_assert(ctx->pred);
796
797 ir = ctx->pred->block->shader;
798
799 for (i = 0; i < ir->predicates_count; i++) {
800 struct ir3_instruction *predicated = ir->predicates[i];
801
802 /* skip instructions already scheduled: */
803 if (is_scheduled(predicated))
804 continue;
805
806 /* remap remaining instructions using current pred
807 * to new pred:
808 *
809 * TODO is there ever a case when pred isn't first
810 * (and only) src?
811 */
812 if (ssa(predicated->regs[1]) == ctx->pred) {
813 if (!new_pred) {
814 new_pred = split_instr(ctx, ctx->pred);
815 /* original pred is scheduled, but new one isn't: */
816 new_pred->flags &= ~IR3_INSTR_MARK;
817 }
818 predicated->regs[1]->instr = new_pred;
819 /* don't need to remove old dag edge since old pred is
820 * already scheduled:
821 */
822 sched_node_add_dep(predicated, new_pred, 0);
823 di(predicated, "new predicate");
824 }
825 }
826
827 /* all remaining predicated remapped to new pred: */
828 ctx->pred = NULL;
829
830 return new_pred;
831 }
832
833 static void
834 sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
835 {
836 struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node);
837
838 dag_init_node(ctx->dag, &n->dag);
839
840 n->instr = instr;
841 instr->data = n;
842 }
843
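/* add a DAG edge from src's node to instr's node, and accumulate the
 * worst-case number of delay slots instr would need after its srcs:
 */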
844 static void
845 sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src, int i)
846 {
847 /* don't consider dependencies in other blocks: */
848 if (src->block != instr->block)
849 return;
850
851 /* we could have false-dep's that end up unused: */
852 if (src->flags & IR3_INSTR_UNUSED) {
853 debug_assert(__is_false_dep(instr, i));
854 return;
855 }
856
857 struct ir3_sched_node *n = instr->data;
858 struct ir3_sched_node *sn = src->data;
859
860 /* If src is consumed by a collect, track that to realize that once
861 * any of the collect srcs are live, we should hurry up and schedule
862 * the rest.
863 */
864 if (instr->opc == OPC_META_COLLECT)
865 sn->collect = instr;
866
867 dag_add_edge(&sn->dag, &n->dag, NULL);
868
869 unsigned d = ir3_delayslots(src, instr, i, true);
870 n->delay = MAX2(n->delay, d);
871 }
872
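/* recursively mark an instruction and its (same-block) ssa srcs as being
 * on the path to a kill, so check_instr() will still consider them while
 * kills remain unscheduled:
 */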
873 static void
874 mark_kill_path(struct ir3_instruction *instr)
875 {
876 struct ir3_sched_node *n = instr->data;
877 n->kill_path = true;
878 struct ir3_instruction *src;
879 foreach_ssa_src (src, instr) {
880 if (src->block != instr->block)
881 continue;
882 mark_kill_path(src);
883 }
884 }
885
886 /* Is it an output? */
887 static bool
888 is_output_collect(struct ir3_instruction *instr)
889 {
890 struct ir3 *ir = instr->block->shader;
891
892 for (unsigned i = 0; i < ir->outputs_count; i++) {
893 struct ir3_instruction *collect = ir->outputs[i];
894 assert(collect->opc == OPC_META_COLLECT);
895 if (instr == collect)
896 return true;
897 }
898
899 return false;
900 }
901
902 /* Is its only use as an output? */
903 static bool
904 is_output_only(struct ir3_instruction *instr)
905 {
906 if (!writes_gpr(instr))
907 return false;
908
909 if (!(instr->regs[0]->flags & IR3_REG_SSA))
910 return false;
911
912 foreach_ssa_use (use, instr)
913 if (!is_output_collect(use))
914 return false;
915
916 return true;
917 }
918
919 static void
920 sched_node_add_deps(struct ir3_instruction *instr)
921 {
922 struct ir3_instruction *src;
923
924 /* Since foreach_ssa_src_n() already handles false-dep's we can construct
925 * the DAG easily in a single pass.
926 */
927 foreach_ssa_src_n (src, i, instr) {
928 sched_node_add_dep(instr, src, i);
929 }
930
931 /* NOTE that all inputs must be scheduled before a kill, so
932 * mark these to be prioritized as well:
933 */
934 if (is_kill(instr) || is_input(instr)) {
935 mark_kill_path(instr);
936 }
937
938 if (is_output_only(instr)) {
939 struct ir3_sched_node *n = instr->data;
940 n->output = true;
941 }
942 }
943
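/* bottom-up DAG traversal callback: a node's max_delay is its own delay
 * plus the largest max_delay of any of its children, ie. the length of the
 * critical path below it. Used as the tie-breaker in choose_instr_dec()
 * and choose_instr_prio().
 */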
944 static void
945 sched_dag_max_delay_cb(struct dag_node *node, void *state)
946 {
947 struct ir3_sched_node *n = (struct ir3_sched_node *)node;
948 uint32_t max_delay = 0;
949
950 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
951 struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
952 max_delay = MAX2(child->max_delay, max_delay);
953 }
954
955 n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
956 }
957
958 static void
959 sched_dag_init(struct ir3_sched_ctx *ctx)
960 {
961 ctx->dag = dag_create(ctx);
962
963 foreach_instr (instr, &ctx->unscheduled_list) {
964 sched_node_init(ctx, instr);
965 sched_node_add_deps(instr);
966 }
967
968 dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
969 }
970
971 static void
972 sched_dag_destroy(struct ir3_sched_ctx *ctx)
973 {
974 ralloc_free(ctx->dag);
975 ctx->dag = NULL;
976 }
977
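/* schedule a single block: build the DAG of unscheduled instructions,
 * schedule meta:input and tex-prefetch instructions up front, then
 * repeatedly pick instructions until the unscheduled list is empty,
 * breaking a0.x/a1.x/p0.x deadlocks by cloning the register writer:
 */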
978 static void
979 sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
980 {
981 ctx->block = block;
982
983 /* addr/pred writes are per-block: */
984 ctx->addr0 = NULL;
985 ctx->addr1 = NULL;
986 ctx->pred = NULL;
987
988 /* move all instructions to the unscheduled list, and
989 * empty the block's instruction list (to which we will
990 * be inserting).
991 */
992 list_replace(&block->instr_list, &ctx->unscheduled_list);
993 list_inithead(&block->instr_list);
994
995 sched_dag_init(ctx);
996
997 ctx->remaining_kills = 0;
998 ctx->remaining_tex = 0;
999 foreach_instr_safe (instr, &ctx->unscheduled_list) {
1000 if (is_kill(instr))
1001 ctx->remaining_kills++;
1002 if (is_tex_or_prefetch(instr))
1003 ctx->remaining_tex++;
1004 }
1005
1006 /* First schedule all meta:input instructions, followed by
1007 * tex-prefetch. We want all of the instructions that load
1008 * values into registers before the shader starts to go
1009 * before any other instructions. But in particular we
1010 * want inputs to come before prefetches. This is because
1011 * a FS's bary_ij input may not actually be live in the
1012 * shader, but it should not be scheduled on top of any
1013 * other input (but can be overwritten by a tex prefetch)
1014 */
1015 foreach_instr_safe (instr, &ctx->unscheduled_list)
1016 if (instr->opc == OPC_META_INPUT)
1017 schedule(ctx, instr);
1018
1019 foreach_instr_safe (instr, &ctx->unscheduled_list)
1020 if (instr->opc == OPC_META_TEX_PREFETCH)
1021 schedule(ctx, instr);
1022
1023 while (!list_is_empty(&ctx->unscheduled_list)) {
1024 struct ir3_sched_notes notes = {0};
1025 struct ir3_instruction *instr;
1026
1027 instr = choose_instr(ctx, &notes);
1028 if (instr) {
1029 unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
1030 d("delay=%u", delay);
1031
1032 /* if the chosen instruction still needs delay slots and we've run
1033 * out of instructions to fill them with, then it is time for nop's:
1034 */
1035 debug_assert(delay <= 6);
1036 while (delay > 0) {
1037 ir3_NOP(block);
1038 delay--;
1039 }
1040
1041 schedule(ctx, instr);
1042 } else {
1043 struct ir3_instruction *new_instr = NULL;
1044 struct ir3 *ir = block->shader;
1045
1046 /* nothing available to schedule.. if we are blocked on
1047 * address/predicate register conflict, then break the
1048 * deadlock by cloning the instruction that wrote that
1049 * reg:
1050 */
1051 if (notes.addr0_conflict) {
1052 new_instr = split_addr(ctx, &ctx->addr0,
1053 ir->a0_users, ir->a0_users_count);
1054 } else if (notes.addr1_conflict) {
1055 new_instr = split_addr(ctx, &ctx->addr1,
1056 ir->a1_users, ir->a1_users_count);
1057 } else if (notes.pred_conflict) {
1058 new_instr = split_pred(ctx);
1059 } else {
1060 d("unscheduled_list:");
1061 foreach_instr (instr, &ctx->unscheduled_list)
1062 di(instr, "unscheduled: ");
1063 debug_assert(0);
1064 ctx->error = true;
1065 return;
1066 }
1067
1068 if (new_instr) {
1069 list_delinit(&new_instr->node);
1070 list_addtail(&new_instr->node, &ctx->unscheduled_list);
1071 }
1072 }
1073 }
1074
1075 sched_dag_destroy(ctx);
1076 }
1077
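/* Pre-RA scheduling entry point. Returns 0 on success, or -1 if we got
 * stuck and could not schedule a block (which should not normally happen).
 */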
1078 int ir3_sched(struct ir3 *ir)
1079 {
1080 struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);
1081
1082 foreach_block (block, &ir->block_list) {
1083 foreach_instr (instr, &block->instr_list) {
1084 instr->data = NULL;
1085 }
1086 }
1087
1088 ir3_count_instructions(ir);
1089 ir3_clear_mark(ir);
1090 ir3_find_ssa_uses(ir, ctx, false);
1091
1092 foreach_block (block, &ir->block_list) {
1093 sched_block(ctx, block);
1094 }
1095
1096 int ret = ctx->error ? -1 : 0;
1097
1098 ralloc_free(ctx);
1099
1100 return ret;
1101 }
1102
1103 static unsigned
1104 get_array_id(struct ir3_instruction *instr)
1105 {
1106 /* The expectation is that there is only a single array
1107 * src or dst, ir3_cp should enforce this.
1108 */
1109
1110 for (unsigned i = 0; i < instr->regs_count; i++)
1111 if (instr->regs[i]->flags & IR3_REG_ARRAY)
1112 return instr->regs[i]->array.id;
1113
1114 unreachable("this was unexpected");
1115 }
1116
1117 /* does instruction 'prior' need to be scheduled before 'instr'? */
1118 static bool
1119 depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
1120 {
1121 /* TODO for dependencies that are related to a specific object, ie
1122 * a specific SSBO/image/array, we could relax this constraint to
1123 * make accesses to unrelated objects not depend on each other (at
1124 * least as long as not declared coherent)
1125 */
1126 if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
1127 ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
1128 return true;
1129
1130 if (instr->barrier_class & prior->barrier_conflict) {
1131 if (!(instr->barrier_class & ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
1132 /* if only array barrier, then we can further limit false-deps
1133 * by considering the array-id, ie reads/writes to different
1134 * arrays do not depend on each other (no aliasing)
1135 */
1136 if (get_array_id(instr) != get_array_id(prior)) {
1137 return false;
1138 }
1139 }
1140
1141 return true;
1142 }
1143
1144 return false;
1145 }
1146
1147 static void
1148 add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
1149 {
1150 struct list_head *prev = instr->node.prev;
1151 struct list_head *next = instr->node.next;
1152
1153 /* add dependencies on previous instructions that must be scheduled
1154 * prior to the current instruction
1155 */
1156 while (prev != &block->instr_list) {
1157 struct ir3_instruction *pi =
1158 LIST_ENTRY(struct ir3_instruction, prev, node);
1159
1160 prev = prev->prev;
1161
1162 if (is_meta(pi))
1163 continue;
1164
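		/* instructions of the same barrier class are (transitively)
		 * ordered amongst themselves already, so a single dep on the
		 * closest one should be enough to keep the rest ordered:
		 */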
1165 if (instr->barrier_class == pi->barrier_class) {
1166 ir3_instr_add_dep(instr, pi);
1167 break;
1168 }
1169
1170 if (depends_on(instr, pi))
1171 ir3_instr_add_dep(instr, pi);
1172 }
1173
1174 /* add dependencies on this instruction to following instructions
1175 * that must be scheduled after the current instruction:
1176 */
1177 while (next != &block->instr_list) {
1178 struct ir3_instruction *ni =
1179 LIST_ENTRY(struct ir3_instruction, next, node);
1180
1181 next = next->next;
1182
1183 if (is_meta(ni))
1184 continue;
1185
1186 if (instr->barrier_class == ni->barrier_class) {
1187 ir3_instr_add_dep(ni, instr);
1188 break;
1189 }
1190
1191 if (depends_on(ni, instr))
1192 ir3_instr_add_dep(ni, instr);
1193 }
1194 }
1195
1196 /* before scheduling a block, we need to add any necessary false-dependencies
1197 * to ensure that:
1198 *
1199 * (1) barriers are scheduled in the right order wrt instructions related
1200 * to the barrier
1201 *
1202 * (2) reads that come before a write actually get scheduled before the
1203 * write
1204 */
1205 static void
1206 calculate_deps(struct ir3_block *block)
1207 {
1208 foreach_instr (instr, &block->instr_list) {
1209 if (instr->barrier_class) {
1210 add_barrier_deps(block, instr);
1211 }
1212 }
1213 }
1214
1215 void
1216 ir3_sched_add_deps(struct ir3 *ir)
1217 {
1218 foreach_block (block, &ir->block_list) {
1219 calculate_deps(block);
1220 }
1221 }