src/freedreno/ir3/ir3_postsched.c

   1 /*
   2  * Copyright (C) 2019 Google, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  *
  23  * Authors:
  24  *    Rob Clark <robclark@freedesktop.org>
  25  */
  26
  27
  28 #include "util/dag.h"
  29 #include "util/u_math.h"
  30
  31 #include "ir3.h"
  32 #include "ir3_compiler.h"
  33 #include "ir3_context.h"
  34
  35 #ifdef DEBUG
  36 #define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
  37 #else
  38 #define SCHED_DEBUG 0
  39 #endif
  40 #define d(fmt, ...) do { if (SCHED_DEBUG) { \
  41         printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
  42 } } while (0)
  43
  44 #define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
  45         printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
  46         ir3_print_instr(instr); \
  47 } } while (0)
  48
  49 /*
  50  * Post RA Instruction Scheduling
  51  */
  52
  53 struct ir3_postsched_ctx {
  54         struct ir3 *ir;
  55
  56         void *mem_ctx;
  57         struct ir3_block *block;           /* the current block */
  58         struct dag *dag;
  59
  60         struct list_head unscheduled_list; /* unscheduled instructions */
  61
  62         int sfu_delay;
  63         int tex_delay;
  64 };
  65
  66 struct ir3_postsched_node {
  67         struct dag_node dag;     /* must be first for util_dynarray_foreach */
  68         struct ir3_instruction *instr;
  69         bool partially_evaluated_path;
  70
  71         unsigned delay;
  72         unsigned max_delay;
  73 };
  74
  75 #define foreach_sched_node(__n, __list) \
  76         list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)
  77
  78 #define foreach_bit(b, mask) \
  79         for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)
  80
  81 static void
  82 schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
  83 {
  84         debug_assert(ctx->block == instr->block);
  85
  86         /* remove from unscheduled_list:
  87          */
  88         list_delinit(&instr->node);
  89
  90         di(instr, "schedule");
  91
  92         list_addtail(&instr->node, &instr->block->instr_list);
  93
  94         struct ir3_postsched_node *n = instr->data;
  95         dag_prune_head(ctx->dag, &n->dag);
  96
  97         if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
  98                 return;
  99
 100         if (is_sfu(instr)) {
 101                 ctx->sfu_delay = 8;
 102         } else if (check_src_cond(instr, is_sfu)) {
 103                 ctx->sfu_delay = 0;
 104         } else if (ctx->sfu_delay > 0) {
 105                 ctx->sfu_delay--;
 106         }
 107
 108         if (is_tex_or_prefetch(instr)) {
 109                 ctx->tex_delay = 10;
 110         } else if (check_src_cond(instr, is_tex_or_prefetch)) {
 111                 ctx->tex_delay = 0;
 112         } else if (ctx->tex_delay > 0) {
 113                 ctx->tex_delay--;
 114         }
 115 }
 116
 117 static void
 118 dump_state(struct ir3_postsched_ctx *ctx)
 119 {
 120         if (!SCHED_DEBUG)
 121                 return;
 122
 123         foreach_sched_node (n, &ctx->dag->heads) {
 124                 di(n->instr, "maxdel=%3d    ", n->max_delay);
 125
 126                 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
 127                         struct ir3_postsched_node *child =
 128                                 (struct ir3_postsched_node *)edge->child;
 129
 130                         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
 131                 }
 132         }
 133 }
 134
 135 /* Determine if this is an instruction that we'd prefer not to schedule
 136  * yet, in order to avoid an (ss) sync.  This is limited by the sfu_delay
 137  * counter, ie. the more cycles it has been since the last SFU, the less
 138  * costly a sync would be.
 139  */
 140 static bool
 141 would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
 142 {
 143         if (ctx->sfu_delay) {
 144                 if (check_src_cond(instr, is_sfu))
 145                         return true;
 146         }
 147
 148         if (ctx->tex_delay) {
 149                 if (check_src_cond(instr, is_tex_or_prefetch))
 150                         return true;
 151         }
 152
 153         return false;
 154 }
 155
 156 /* find instruction to schedule: */
 157 static struct ir3_instruction *
 158 choose_instr(struct ir3_postsched_ctx *ctx)
 159 {
 160         struct ir3_postsched_node *chosen = NULL;
 161
 162         dump_state(ctx);
 163
 164         foreach_sched_node (n, &ctx->dag->heads) {
 165                 if (!is_meta(n->instr))
 166                         continue;
 167
 168                 if (!chosen || (chosen->max_delay < n->max_delay))
 169                         chosen = n;
 170         }
 171
 172         if (chosen) {
 173                 di(chosen->instr, "prio: chose (meta)");
 174                 return chosen->instr;
 175         }
 176
 177         /* Try to schedule inputs with a higher priority, if possible, as
 178          * the last bary.f unlocks varying storage to unblock more VS
 179          * warps.
 180          */
 181         foreach_sched_node (n, &ctx->dag->heads) {
 182                 if (!is_input(n->instr))
 183                         continue;
 184
 185                 if (!chosen || (chosen->max_delay < n->max_delay))
 186                         chosen = n;
 187         }
 188
 189         if (chosen) {
 190                 di(chosen->instr, "prio: chose (input)");
 191                 return chosen->instr;
 192         }
 193
 194         /* Next prioritize discards: */
 195         foreach_sched_node (n, &ctx->dag->heads) {
 196                 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
 197
 198                 if (d > 0)
 199                         continue;
 200
 201                 if (!is_kill(n->instr))
 202                         continue;
 203
 204                 if (!chosen || (chosen->max_delay < n->max_delay))
 205                         chosen = n;
 206         }
 207
 208         if (chosen) {
 209                 di(chosen->instr, "csp: chose (kill, hard ready)");
 210                 return chosen->instr;
 211         }
 212
 213         /* Next prioritize expensive instructions: */
 214         foreach_sched_node (n, &ctx->dag->heads) {
 215                 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
 216
 217                 if (d > 0)
 218                         continue;
 219
 220                 if (!(is_sfu(n->instr) || is_tex(n->instr)))
 221                         continue;
 222
 223                 if (!chosen || (chosen->max_delay < n->max_delay))
 224                         chosen = n;
 225         }
 226
 227         if (chosen) {
 228                 di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
 229                 return chosen->instr;
 230         }
 231
 232         /*
 233          * Sometimes be better to take a nop, rather than scheduling an
 234          * instruction that would require an (ss) shortly after another
 235          * SFU..  ie. if last SFU was just one or two instr ago, and we
 236          * could choose between taking a nop and then scheduling
 237          * something else, vs scheduling the immed avail instruction that
 238          * would require (ss), we are better with the nop.
 239          */
 240         for (unsigned delay = 0; delay < 4; delay++) {
 241                 foreach_sched_node (n, &ctx->dag->heads) {
 242                         if (would_sync(ctx, n->instr))
 243                                 continue;
 244
 245                         unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
 246
 247                         if (d > delay)
 248                                 continue;
 249
 250                         if (!chosen || (chosen->max_delay < n->max_delay))
 251                                 chosen = n;
 252                 }
 253
 254                 if (chosen) {
 255                         di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
 256                         return chosen->instr;
 257                 }
 258         }
 259
 260         /* Next try to find a ready leader w/ soft delay (ie. including extra
 261          * delay for things like tex fetch which can be synchronized w/ sync
 262          * bit (but we probably do want to schedule some other instructions
 263          * while we wait)
 264          */
 265         foreach_sched_node (n, &ctx->dag->heads) {
 266                 unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
 267
 268                 if (d > 0)
 269                         continue;
 270
 271                 if (!chosen || (chosen->max_delay < n->max_delay))
 272                         chosen = n;
 273         }
 274
 275         if (chosen) {
 276                 di(chosen->instr, "csp: chose (soft ready)");
 277                 return chosen->instr;
 278         }
 279
 280         /* Next try to find a ready leader that can be scheduled without nop's,
 281          * which in the case of things that need (sy)/(ss) could result in
 282          * stalls.. but we've already decided there is not a better option.
 283          */
 284         foreach_sched_node (n, &ctx->dag->heads) {
 285                 unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
 286
 287                 if (d > 0)
 288                         continue;
 289
 290                 if (!chosen || (chosen->max_delay < n->max_delay))
 291                         chosen = n;
 292         }
 293
 294         if (chosen) {
 295                 di(chosen->instr, "csp: chose (hard ready)");
 296                 return chosen->instr;
 297         }
 298
 299         /* Otherwise choose leader with maximum cost:
 300          *
 301          * TODO should we try to balance cost and delays?  I guess it is
 302          * a balance between now-nop's and future-nop's?
 303          */
 304         foreach_sched_node (n, &ctx->dag->heads) {
 305                 if (!chosen || chosen->max_delay < n->max_delay)
 306                         chosen = n;
 307         }
 308
 309         if (chosen) {
 310                 di(chosen->instr, "csp: chose (leader)");
 311                 return chosen->instr;
 312         }
 313
 314         return NULL;
 315 }
 316
 317 struct ir3_postsched_deps_state {
 318         struct ir3_postsched_ctx *ctx;
 319
 320         enum { F, R } direction;
 321
 322         bool merged;
 323
 324         /* Track the mapping between sched node (instruction) that last
 325          * wrote a given register (in whichever direction we are iterating
 326          * the block)
 327          *
 328          * Note, this table is twice as big as the # of regs, to deal with
 329          * half-precision regs.  The approach differs depending on whether
 330          * the half and full precision register files are "merged" (conflict,
 331          * ie. a6xx+) in which case we consider each full precision dep
 332          * as two half-precision dependencies, vs older separate (non-
 333          * conflicting) in which case the first half of the table is used
 334          * for full precision and 2nd half for half-precision.
 335          */
 336         struct ir3_postsched_node *regs[2 * 256];
 337 };
 338
 339 /* bounds checking read/write accessors, since OoB access to stuff on
 340  * the stack is gonna cause a bad day.
 341  */
 342 #define dep_reg(state, idx) *({ \
 343                 assert((idx) < ARRAY_SIZE((state)->regs)); \
 344                 &(state)->regs[(idx)]; \
 345         })
 346
 347 static void
 348 add_dep(struct ir3_postsched_deps_state *state,
 349                 struct ir3_postsched_node *before,
 350                 struct ir3_postsched_node *after)
 351 {
 352         if (!before || !after)
 353                 return;
 354
 355         assert(before != after);
 356
 357         if (state->direction == F) {
 358                 dag_add_edge(&before->dag, &after->dag, NULL);
 359         } else {
 360                 dag_add_edge(&after->dag, &before->dag, NULL);
 361         }
 362 }
 363
 364 static void
 365 add_single_reg_dep(struct ir3_postsched_deps_state *state,
 366                 struct ir3_postsched_node *node, unsigned num, bool write)
 367 {
 368         add_dep(state, dep_reg(state, num), node);
 369         if (write) {
 370                 dep_reg(state, num) = node;
 371         }
 372 }
 373
 374 /* This is where we handled full vs half-precision, and potential conflicts
 375  * between half and full precision that result in additional dependencies.
 376  * The 'reg' arg is really just to know half vs full precision.
 377  */
 378 static void
 379 add_reg_dep(struct ir3_postsched_deps_state *state,
 380                 struct ir3_postsched_node *node, const struct ir3_register *reg,
 381                 unsigned num, bool write)
 382 {
 383         if (state->merged) {
 384                 if (reg->flags & IR3_REG_HALF) {
 385                         /* single conflict in half-reg space: */
 386                         add_single_reg_dep(state, node, num, write);
 387                 } else {
 388                         /* two conflicts in half-reg space: */
 389                         add_single_reg_dep(state, node, 2 * num + 0, write);
 390                         add_single_reg_dep(state, node, 2 * num + 1, write);
 391                 }
 392         } else {
 393                 if (reg->flags & IR3_REG_HALF)
 394                         num += ARRAY_SIZE(state->regs) / 2;
 395                 add_single_reg_dep(state, node, num, write);
 396         }
 397 }
 398
 399 static void
 400 calculate_deps(struct ir3_postsched_deps_state *state,
 401                 struct ir3_postsched_node *node)
 402 {
 403         int b;
 404
 405         /* Add dependencies on instructions that previously (or next,
 406          * in the reverse direction) wrote any of our src registers:
 407          */
 408         foreach_src_n (reg, i, node->instr) {
 409                 if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
 410                         continue;
 411
 412                 if (reg->flags & IR3_REG_RELATIV) {
 413                         /* mark entire array as read: */
 414                         struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
 415                         for (unsigned i = 0; i < arr->length; i++) {
 416                                 add_reg_dep(state, node, reg, arr->reg + i, false);
 417                         }
 418                 } else {
 419                         foreach_bit (b, reg->wrmask) {
 420                                 add_reg_dep(state, node, reg, reg->num + b, false);
 421
 422                                 struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
 423                                 if (dep && (state->direction == F)) {
 424                                         unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
 425                                         node->delay = MAX2(node->delay, d);
 426                                 }
 427                         }
 428                 }
 429         }
 430
 431         if (node->instr->address) {
 432                 add_reg_dep(state, node, node->instr->address->regs[0],
 433                                         node->instr->address->regs[0]->num,
 434                                         false);
 435         }
 436
 437         if (dest_regs(node->instr) == 0)
 438                 return;
 439
 440         /* And then after we update the state for what this instruction
 441          * wrote:
 442          */
 443         struct ir3_register *reg = node->instr->regs[0];
 444         if (reg->flags & IR3_REG_RELATIV) {
 445                 /* mark the entire array as written: */
 446                 struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
 447                 for (unsigned i = 0; i < arr->length; i++) {
 448                         add_reg_dep(state, node, reg, arr->reg + i, true);
 449                 }
 450         } else {
 451                 foreach_bit (b, reg->wrmask) {
 452                         add_reg_dep(state, node, reg, reg->num + b, true);
 453                 }
 454         }
 455 }
 456
 457 static void
 458 calculate_forward_deps(struct ir3_postsched_ctx *ctx)
 459 {
 460         struct ir3_postsched_deps_state state = {
 461                         .ctx = ctx,
 462                         .direction = F,
 463                         .merged = ctx->ir->compiler->gpu_id >= 600,
 464         };
 465
 466         foreach_instr (instr, &ctx->unscheduled_list) {
 467                 calculate_deps(&state, instr->data);
 468         }
 469 }
 470
 471 static void
 472 calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
 473 {
 474         struct ir3_postsched_deps_state state = {
 475                         .ctx = ctx,
 476                         .direction = R,
 477                         .merged = ctx->ir->compiler->gpu_id >= 600,
 478         };
 479
 480         foreach_instr_rev (instr, &ctx->unscheduled_list) {
 481                 calculate_deps(&state, instr->data);
 482         }
 483 }
 484
 485 static void
 486 sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
 487 {
 488         struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);
 489
 490         dag_init_node(ctx->dag, &n->dag);
 491
 492         n->instr = instr;
 493         instr->data = n;
 494 }
 495
 496 static void
 497 sched_dag_max_delay_cb(struct dag_node *node, void *state)
 498 {
 499         struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
 500         uint32_t max_delay = 0;
 501
 502         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
 503                 struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
 504                 max_delay = MAX2(child->max_delay, max_delay);
 505         }
 506
 507         n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
 508 }
 509
 510 static void
 511 sched_dag_init(struct ir3_postsched_ctx *ctx)
 512 {
 513         ctx->mem_ctx = ralloc_context(NULL);
 514
 515         ctx->dag = dag_create(ctx->mem_ctx);
 516
 517         foreach_instr (instr, &ctx->unscheduled_list)
 518                 sched_node_init(ctx, instr);
 519
 520         calculate_forward_deps(ctx);
 521         calculate_reverse_deps(ctx);
 522
 523         /*
 524          * To avoid expensive texture fetches, etc, from being moved ahead
 525          * of kills, track the kills we've seen so far, so we can add an
 526          * extra dependency on them for tex/mem instructions
 527          */
 528         struct util_dynarray kills;
 529         util_dynarray_init(&kills, ctx->mem_ctx);
 530
 531         /*
 532          * Normal srcs won't be in SSA at this point, those are dealt with in
 533          * calculate_forward_deps() and calculate_reverse_deps().  But we still
 534          * have the false-dep information in SSA form, so go ahead and add
 535          * dependencies for that here:
 536          */
 537         foreach_instr (instr, &ctx->unscheduled_list) {
 538                 struct ir3_postsched_node *n = instr->data;
 539
 540                 foreach_ssa_src_n (src, i, instr) {
 541                         if (src->block != instr->block)
 542                                 continue;
 543
 544                         /* we can end up with unused false-deps.. just skip them: */
 545                         if (src->flags & IR3_INSTR_UNUSED)
 546                                 continue;
 547
 548                         struct ir3_postsched_node *sn = src->data;
 549
 550                         /* don't consider dependencies in other blocks: */
 551                         if (src->block != instr->block)
 552                                 continue;
 553
 554                         dag_add_edge(&sn->dag, &n->dag, NULL);
 555                 }
 556
 557                 if (is_kill(instr)) {
 558                         util_dynarray_append(&kills, struct ir3_instruction *, instr);
 559                 } else if (is_tex(instr) || is_mem(instr)) {
 560                         util_dynarray_foreach(&kills, struct ir3_instruction *, instrp) {
 561                                 struct ir3_instruction *kill = *instrp;
 562                                 struct ir3_postsched_node *kn = kill->data;
 563                                 dag_add_edge(&kn->dag, &n->dag, NULL);
 564                         }
 565                 }
 566         }
 567
 568         // TODO do we want to do this after reverse-dependencies?
 569         dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
 570 }
 571
 572 static void
 573 sched_dag_destroy(struct ir3_postsched_ctx *ctx)
 574 {
 575         ralloc_free(ctx->mem_ctx);
 576         ctx->mem_ctx = NULL;
 577         ctx->dag = NULL;
 578 }
 579
 580 static void
 581 sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
 582 {
 583         ctx->block = block;
 584         ctx->tex_delay = 0;
 585         ctx->sfu_delay = 0;
 586
 587         /* move all instructions to the unscheduled list, and
 588          * empty the block's instruction list (to which we will
 589          * be inserting).
 590          */
 591         list_replace(&block->instr_list, &ctx->unscheduled_list);
 592         list_inithead(&block->instr_list);
 593
 594         // TODO once we are using post-sched for everything we can
 595         // just not stick in NOP's prior to post-sched, and drop this.
 596         // for now keep this, since it makes post-sched optional:
 597         foreach_instr_safe (instr, &ctx->unscheduled_list) {
 598                 switch (instr->opc) {
 599                 case OPC_NOP:
 600                 case OPC_B:
 601                 case OPC_JUMP:
 602                         list_delinit(&instr->node);
 603                         break;
 604                 default:
 605                         break;
 606                 }
 607         }
 608
 609         sched_dag_init(ctx);
 610
 611         /* First schedule all meta:input instructions, followed by
 612          * tex-prefetch.  We want all of the instructions that load
 613          * values into registers before the shader starts to go
 614          * before any other instructions.  But in particular we
 615          * want inputs to come before prefetches.  This is because
 616          * a FS's bary_ij input may not actually be live in the
 617          * shader, but it should not be scheduled on top of any
 618          * other input (but can be overwritten by a tex prefetch)
 619          */
 620         foreach_instr_safe (instr, &ctx->unscheduled_list)
 621                 if (instr->opc == OPC_META_INPUT)
 622                         schedule(ctx, instr);
 623
 624         foreach_instr_safe (instr, &ctx->unscheduled_list)
 625                 if (instr->opc == OPC_META_TEX_PREFETCH)
 626                         schedule(ctx, instr);
 627
 628         while (!list_is_empty(&ctx->unscheduled_list)) {
 629                 struct ir3_instruction *instr = choose_instr(ctx);
 630
 631                 unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
 632                 d("delay=%u", delay);
 633
 634                 /* and if we run out of instructions that can be scheduled,
 635                  * then it is time for nop's:
 636                  */
 637                 debug_assert(delay <= 6);
 638                 while (delay > 0) {
 639                         ir3_NOP(block);
 640                         delay--;
 641                 }
 642
 643                 schedule(ctx, instr);
 644         }
 645
 646         sched_dag_destroy(ctx);
 647 }
 648
 649
 650 static bool
 651 is_self_mov(struct ir3_instruction *instr)
 652 {
 653         if (!is_same_type_mov(instr))
 654                 return false;
 655
 656         if (instr->regs[0]->num != instr->regs[1]->num)
 657                 return false;
 658
 659         if (instr->regs[0]->flags & IR3_REG_RELATIV)
 660                 return false;
 661
 662         if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
 663                         IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
 664                         IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT |
 665                         IR3_REG_EVEN | IR3_REG_POS_INF))
 666                 return false;
 667
 668         return true;
 669 }
 670
 671 /* sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y
 672  * as a result of places were before RA we are not sure that it is
 673  * safe to eliminate.  We could eliminate these earlier, but sometimes
 674  * they are tangled up in false-dep's, etc, so it is easier just to
 675  * let them exist until after RA
 676  */
 677 static void
 678 cleanup_self_movs(struct ir3 *ir)
 679 {
 680         foreach_block (block, &ir->block_list) {
 681                 foreach_instr_safe (instr, &block->instr_list) {
 682
 683                         foreach_src (reg, instr) {
 684                                 if (!reg->instr)
 685                                         continue;
 686
 687                                 if (is_self_mov(reg->instr)) {
 688                                         list_delinit(&reg->instr->node);
 689                                         reg->instr = reg->instr->regs[1]->instr;
 690                                 }
 691                         }
 692
 693                         for (unsigned i = 0; i < instr->deps_count; i++) {
 694                                 if (instr->deps[i] && is_self_mov(instr->deps[i])) {
 695                                         list_delinit(&instr->deps[i]->node);
 696                                         instr->deps[i] = instr->deps[i]->regs[1]->instr;
 697                                 }
 698                         }
 699                 }
 700         }
 701 }
 702
 703 bool
 704 ir3_postsched(struct ir3 *ir)
 705 {
 706         struct ir3_postsched_ctx ctx = {
 707                         .ir = ir,
 708         };
 709
 710         ir3_remove_nops(ir);
 711         cleanup_self_movs(ir);
 712
 713         foreach_block (block, &ir->block_list) {
 714                 sched_block(&ctx, block);
 715         }
 716
 717         return true;
 718 }