src/freedreno/ir3/ir3_postsched.c
/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
	printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
	printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
	ir3_print_instr(instr); \
} } while (0)

/*
 * Post RA Instruction Scheduling
 */
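/*
 * Overview: scheduling runs per-block, after register allocation.  For
 * each block we build a DAG whose edges are the register (and false)
 * dependencies between instructions, then greedily emit ready
 * instructions, trying to minimize nop padding and (ss)/(sy) syncs.
 */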

struct ir3_postsched_ctx {
	struct ir3 *ir;

	struct ir3_shader_variant *v;

	void *mem_ctx;
	struct ir3_block *block;           /* the current block */
	struct dag *dag;

	struct list_head unscheduled_list; /* unscheduled instructions */

	int sfu_delay;
	int tex_delay;
};

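/* Per-instruction scheduling state.  The delay fields drive instruction
 * selection: 'delay' is (roughly) the delay-slot cost of this instruction
 * relative to the producers of its sources, and 'max_delay' accumulates
 * that cost along the longest path to the end of the DAG, so it
 * approximates the remaining critical path below this node.
 */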
struct ir3_postsched_node {
	struct dag_node dag;     /* must be first for util_dynarray_foreach */
	struct ir3_instruction *instr;
	bool partially_evaluated_path;

	unsigned delay;
	unsigned max_delay;
};

#define foreach_sched_node(__n, __list) \
	list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)

#define foreach_bit(b, mask) \
	for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)

static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* remove from unscheduled_list:
	 */
	list_delinit(&instr->node);

	di(instr, "schedule");

	list_addtail(&instr->node, &instr->block->instr_list);

	struct ir3_postsched_node *n = instr->data;
	dag_prune_head(ctx->dag, &n->dag);

	if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
		return;

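	/* Count down, per scheduled instruction, the remaining latency of the
	 * most recently scheduled SFU and tex/prefetch instructions.  The
	 * counter is reset to zero when a consumer is scheduled anyway (since
	 * the sync has then already been paid for).  would_sync() uses these
	 * counters to avoid scheduling a consumer so close behind its producer
	 * that it would need an (ss)/(sy) sync.
	 */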
	if (is_sfu(instr)) {
		ctx->sfu_delay = 8;
	} else if (check_src_cond(instr, is_sfu)) {
		ctx->sfu_delay = 0;
	} else if (ctx->sfu_delay > 0) {
		ctx->sfu_delay--;
	}

	if (is_tex_or_prefetch(instr)) {
		ctx->tex_delay = 10;
	} else if (check_src_cond(instr, is_tex_or_prefetch)) {
		ctx->tex_delay = 0;
	} else if (ctx->tex_delay > 0) {
		ctx->tex_delay--;
	}
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
	if (!SCHED_DEBUG)
		return;

	foreach_sched_node (n, &ctx->dag->heads) {
		di(n->instr, "maxdel=%3d ", n->max_delay);

		util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
			struct ir3_postsched_node *child =
				(struct ir3_postsched_node *)edge->child;

			di(child->instr, " -> (%d parents) ", child->dag.parent_count);
		}
	}
}

/* Determine if this is an instruction that we'd prefer not to schedule
 * yet, in order to avoid an (ss) or (sy) sync.  This is limited by the
 * sfu_delay/tex_delay counters, ie. the more cycles it has been since
 * the last SFU or tex fetch, the less costly a sync would be.
 */
static bool
would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	if (ctx->sfu_delay) {
		if (check_src_cond(instr, is_sfu))
			return true;
	}

	if (ctx->tex_delay) {
		if (check_src_cond(instr, is_tex_or_prefetch))
			return true;
	}

	return false;
}

/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_node *chosen = NULL;

	dump_state(ctx);

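	/* The passes below are tried in priority order: meta instructions,
	 * then inputs, then kills and expensive (sfu/tex) instructions that
	 * are ready without nop's, then sync-avoiding / soft-ready /
	 * hard-ready candidates, and finally whichever leader has the largest
	 * max_delay.  Within each pass, ties are broken by preferring the
	 * larger max_delay (ie. the longer remaining dependency chain).
	 */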
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_meta(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (meta)");
		return chosen->instr;
	}

	/* Try to schedule inputs with a higher priority, if possible, as
	 * the last bary.f unlocks varying storage to unblock more VS
	 * warps.
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_input(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (input)");
		return chosen->instr;
	}

	/* Next prioritize discards: */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!is_kill(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (kill, hard ready)");
		return chosen->instr;
	}

	/* Next prioritize expensive instructions: */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!(is_sfu(n->instr) || is_tex(n->instr)))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
		return chosen->instr;
	}

	/*
	 * Sometimes it is better to take a nop rather than schedule an
	 * instruction that would require an (ss) shortly after another
	 * SFU, ie. if the last SFU was just one or two instructions ago,
	 * and we could choose between taking a nop and then scheduling
	 * something else, vs scheduling the immediately available
	 * instruction that would require an (ss), we are better off with
	 * the nop.
	 */
	for (unsigned delay = 0; delay < 4; delay++) {
		foreach_sched_node (n, &ctx->dag->heads) {
			if (would_sync(ctx, n->instr))
				continue;

			unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

			if (d > delay)
				continue;

			if (!chosen || (chosen->max_delay < n->max_delay))
				chosen = n;
		}

		if (chosen) {
			di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
			return chosen->instr;
		}
	}

	/* Next try to find a ready leader w/ soft delay (ie. including extra
	 * delay for things like tex fetch, which can be synchronized w/ the
	 * sync bit, but where we probably do want to schedule some other
	 * instructions while we wait).
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

		if (d > 0)
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (soft ready)");
		return chosen->instr;
	}

	/* Next try to find a ready leader that can be scheduled without nop's,
	 * which in the case of things that need (sy)/(ss) could result in
	 * stalls.. but we've already decided there is not a better option.
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (hard ready)");
		return chosen->instr;
	}

	/* Otherwise choose leader with maximum cost:
	 *
	 * TODO should we try to balance cost and delays?  I guess it is
	 * a balance between now-nop's and future-nop's?
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!chosen || chosen->max_delay < n->max_delay)
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (leader)");
		return chosen->instr;
	}

	return NULL;
}

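/* State for the forward/reverse dependency-calculation passes over a
 * block, used to turn register reads/writes into DAG edges.
 */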
struct ir3_postsched_deps_state {
	struct ir3_postsched_ctx *ctx;

	enum { F, R } direction;

	bool merged;

	/* Track the mapping between sched node (instruction) that last
	 * wrote a given register (in whichever direction we are iterating
	 * the block).
	 *
	 * Note, this table is twice as big as the # of regs, to deal with
	 * half-precision regs.  The approach differs depending on whether
	 * the half and full precision register files are "merged" (ie.
	 * conflicting, a6xx+): in the merged case we consider each full
	 * precision dep as two half-precision dependencies, while in the
	 * older separate (non-conflicting) case the first half of the
	 * table is used for full precision and the 2nd half for
	 * half-precision.
	 */
	struct ir3_postsched_node *regs[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx) *({ \
	assert((idx) < ARRAY_SIZE((state)->regs)); \
	&(state)->regs[(idx)]; \
	})

static void
add_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *before,
		struct ir3_postsched_node *after)
{
	if (!before || !after)
		return;

	assert(before != after);

	if (state->direction == F) {
		dag_add_edge(&before->dag, &after->dag, NULL);
	} else {
		dag_add_edge(&after->dag, &before->dag, NULL);
	}
}

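/* Add a dependency against the node that last wrote a single register
 * slot; if this access is itself a write, record this node as the new
 * last writer so that later accesses of the slot depend on it.
 */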
static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node, unsigned num, bool write)
{
	add_dep(state, dep_reg(state, num), node);
	if (write) {
		dep_reg(state, num) = node;
	}
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 */
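/* For example, with merged register files a full-precision write to
 * r0.y (regid 1) lands in half-reg slots 2 and 3 (ie. it conflicts with
 * hr0.z and hr0.w), whereas on a non-merged file it would just occupy
 * slot 1 in the full-precision half of the table.
 */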
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node, const struct ir3_register *reg,
		unsigned num, bool write)
{
	if (state->merged) {
		if (reg->flags & IR3_REG_HALF) {
			/* single conflict in half-reg space: */
			add_single_reg_dep(state, node, num, write);
		} else {
			/* two conflicts in half-reg space: */
			add_single_reg_dep(state, node, 2 * num + 0, write);
			add_single_reg_dep(state, node, 2 * num + 1, write);
		}
	} else {
		if (reg->flags & IR3_REG_HALF)
			num += ARRAY_SIZE(state->regs) / 2;
		add_single_reg_dep(state, node, num, write);
	}
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node)
{
	int b;

	/* Add dependencies on instructions that previously (or next,
	 * in the reverse direction) wrote any of our src registers:
	 */
	foreach_src_n (reg, i, node->instr) {
		if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
			continue;

		if (reg->flags & IR3_REG_RELATIV) {
			/* mark entire array as read: */
			struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
			for (unsigned i = 0; i < arr->length; i++) {
				add_reg_dep(state, node, reg, arr->reg + i, false);
			}
		} else {
			foreach_bit (b, reg->wrmask) {
				add_reg_dep(state, node, reg, reg->num + b, false);

				struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
				if (dep && (state->direction == F)) {
					unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
					node->delay = MAX2(node->delay, d);
				}
			}
		}
	}

	if (node->instr->address) {
		add_reg_dep(state, node, node->instr->address->regs[0],
				node->instr->address->regs[0]->num,
				false);
	}

	if (dest_regs(node->instr) == 0)
		return;

	/* And then after we update the state for what this instruction
	 * wrote:
	 */
	struct ir3_register *reg = node->instr->regs[0];
	if (reg->flags & IR3_REG_RELATIV) {
		/* mark the entire array as written: */
		struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
		for (unsigned i = 0; i < arr->length; i++) {
			add_reg_dep(state, node, reg, arr->reg + i, true);
		}
	} else {
		foreach_bit (b, reg->wrmask) {
			add_reg_dep(state, node, reg, reg->num + b, true);
		}
	}
}

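/* The forward pass catches read-after-write and write-after-write
 * ordering, while the reverse pass (with the edge direction flipped in
 * add_dep()) catches write-after-read, so between the two passes all
 * register hazards within the block become DAG edges.
 */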
static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_deps_state state = {
			.ctx = ctx,
			.direction = F,
			.merged = ctx->v->mergedregs,
	};

	foreach_instr (instr, &ctx->unscheduled_list) {
		calculate_deps(&state, instr->data);
	}
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_deps_state state = {
			.ctx = ctx,
			.direction = R,
			.merged = ctx->v->mergedregs,
	};

	foreach_instr_rev (instr, &ctx->unscheduled_list) {
		calculate_deps(&state, instr->data);
	}
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

	dag_init_node(ctx->dag, &n->dag);

	n->instr = instr;
	instr->data = n;
}

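/* Callback for dag_traverse_bottom_up(): propagate max_delay up from the
 * children, so that each node's max_delay reflects the accumulated delay
 * along its longest downstream chain, which choose_instr() then uses as
 * the scheduling priority.
 */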
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
	struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
	uint32_t max_delay = 0;

	util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
		struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
		max_delay = MAX2(child->max_delay, max_delay);
	}

	n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
	ctx->mem_ctx = ralloc_context(NULL);

	ctx->dag = dag_create(ctx->mem_ctx);

	foreach_instr (instr, &ctx->unscheduled_list)
		sched_node_init(ctx, instr);

	calculate_forward_deps(ctx);
	calculate_reverse_deps(ctx);

	/*
	 * To avoid expensive texture fetches, etc, from being moved ahead
	 * of kills, track the kills we've seen so far, so we can add an
	 * extra dependency on them for tex/mem instructions
	 */
	struct util_dynarray kills;
	util_dynarray_init(&kills, ctx->mem_ctx);

	/*
	 * Normal srcs won't be in SSA at this point, those are dealt with in
	 * calculate_forward_deps() and calculate_reverse_deps().  But we still
	 * have the false-dep information in SSA form, so go ahead and add
	 * dependencies for that here:
	 */
	foreach_instr (instr, &ctx->unscheduled_list) {
		struct ir3_postsched_node *n = instr->data;

		foreach_ssa_src_n (src, i, instr) {
			/* don't consider dependencies in other blocks: */
			if (src->block != instr->block)
				continue;

			/* we can end up with unused false-deps.. just skip them: */
			if (src->flags & IR3_INSTR_UNUSED)
				continue;

			struct ir3_postsched_node *sn = src->data;

			dag_add_edge(&sn->dag, &n->dag, NULL);
		}

		if (is_kill(instr)) {
			util_dynarray_append(&kills, struct ir3_instruction *, instr);
		} else if (is_tex(instr) || is_mem(instr)) {
			util_dynarray_foreach(&kills, struct ir3_instruction *, instrp) {
				struct ir3_instruction *kill = *instrp;
				struct ir3_postsched_node *kn = kill->data;
				dag_add_edge(&kn->dag, &n->dag, NULL);
			}
		}
	}

	// TODO do we want to do this after reverse-dependencies?
	dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
	ralloc_free(ctx->mem_ctx);
	ctx->mem_ctx = NULL;
	ctx->dag = NULL;
}

static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
	ctx->block = block;
	ctx->tex_delay = 0;
	ctx->sfu_delay = 0;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &ctx->unscheduled_list);
	list_inithead(&block->instr_list);

	// TODO once we are using post-sched for everything we can
	// just not stick in NOP's prior to post-sched, and drop this.
	// for now keep this, since it makes post-sched optional:
	foreach_instr_safe (instr, &ctx->unscheduled_list) {
		switch (instr->opc) {
		case OPC_NOP:
		case OPC_B:
		case OPC_JUMP:
			list_delinit(&instr->node);
			break;
		default:
			break;
		}
	}

	sched_dag_init(ctx);

	/* First schedule all meta:input instructions, followed by
	 * tex-prefetch.  We want all of the instructions that load
	 * values into registers before the shader starts to go
	 * before any other instructions.  But in particular we
	 * want inputs to come before prefetches.  This is because
	 * a FS's bary_ij input may not actually be live in the
	 * shader, but it should not be scheduled on top of any
	 * other input (but can be overwritten by a tex prefetch)
	 */
	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_INPUT)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_TEX_PREFETCH)
			schedule(ctx, instr);

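	/* Main scheduling loop: repeatedly pick the best ready instruction,
	 * pad with however many nop's its hard delay requirement still
	 * needs, and emit it into the block.
	 */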
	while (!list_is_empty(&ctx->unscheduled_list)) {
		struct ir3_instruction *instr = choose_instr(ctx);

		unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
		d("delay=%u", delay);

		/* and if we run out of instructions that can be scheduled,
		 * then it is time for nop's:
		 */
		debug_assert(delay <= 6);
		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		schedule(ctx, instr);
	}

	sched_dag_destroy(ctx);
}


static bool
is_self_mov(struct ir3_instruction *instr)
{
	if (!is_same_type_mov(instr))
		return false;

	if (instr->regs[0]->num != instr->regs[1]->num)
		return false;

	if (instr->regs[0]->flags & IR3_REG_RELATIV)
		return false;

	if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
			IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
			IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT |
			IR3_REG_EVEN | IR3_REG_POS_INF))
		return false;

	return true;
}

/* Sometimes we end up with in-place mov's, ie. mov.u32u32 r1.y, r1.y,
 * as a result of places where before RA we are not sure that it is
 * safe to eliminate.  We could eliminate these earlier, but sometimes
 * they are tangled up in false-dep's, etc, so it is easier just to
 * let them exist until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		foreach_instr_safe (instr, &block->instr_list) {

			foreach_src (reg, instr) {
				if (!reg->instr)
					continue;

				if (is_self_mov(reg->instr)) {
					list_delinit(&reg->instr->node);
					reg->instr = reg->instr->regs[1]->instr;
				}
			}

			for (unsigned i = 0; i < instr->deps_count; i++) {
				if (instr->deps[i] && is_self_mov(instr->deps[i])) {
					list_delinit(&instr->deps[i]->node);
					instr->deps[i] = instr->deps[i]->regs[1]->instr;
				}
			}
		}
	}
}

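/* Pass entry point: drop the nop's inserted by earlier passes and the
 * self-mov's left behind by RA, then (re)schedule each block.
 */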
bool
ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
	struct ir3_postsched_ctx ctx = {
			.ir = ir,
			.v = v,
	};

	ir3_remove_nops(ir);
	cleanup_self_movs(ir);

	foreach_block (block, &ir->block_list) {
		sched_block(&ctx, block);
	}

	return true;
}