freedreno/ir3: Sync some new changes from envytools.
[mesa.git] src/freedreno/ir3/ir3_postsched.c
/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
	printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
	printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
	ir3_print_instr(instr); \
} } while (0)

/*
 * Post RA Instruction Scheduling
 */

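/* Overview: scheduling is done one basic block at a time.  For each block
 * we build a DAG whose edges are the post-RA register dependencies (plus
 * false-deps and kill ordering), then repeatedly pick a ready DAG head
 * using the heuristics in choose_instr(), inserting nop's to cover any
 * delay cycles the chosen instruction still needs.
 */
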
struct ir3_postsched_ctx {
	struct ir3_context *ctx;

	void *mem_ctx;
	struct ir3_block *block;           /* the current block */
	struct dag *dag;

	struct list_head unscheduled_list; /* unscheduled instructions */

	bool error;

	int sfu_delay;
};

struct ir3_postsched_node {
	struct dag_node dag;     /* must be first for util_dynarray_foreach */
	struct ir3_instruction *instr;
	bool partially_evaluated_path;

	unsigned delay;
	unsigned max_delay;
};

#define foreach_sched_node(__n, __list) \
	list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)

#define foreach_bit(b, mask) \
	for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)

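/* Commit an instruction: move it from the unscheduled list to the end of
 * the block, update the SFU-distance counter used by would_sync(), and
 * prune its node from the DAG so that its consumers can become ready.
 */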
static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* remove from unscheduled_list:
	 */
	list_delinit(&instr->node);

	di(instr, "schedule");

	list_addtail(&instr->node, &instr->block->instr_list);

	if (is_sfu(instr)) {
		ctx->sfu_delay = 8;
	} else if (ctx->sfu_delay > 0) {
		ctx->sfu_delay--;
	}

	struct ir3_postsched_node *n = instr->data;
	dag_prune_head(ctx->dag, &n->dag);
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
	if (!SCHED_DEBUG)
		return;

	foreach_sched_node (n, &ctx->dag->heads) {
		di(n->instr, "maxdel=%3d ", n->max_delay);

		util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
			struct ir3_postsched_node *child =
				(struct ir3_postsched_node *)edge->child;

			di(child->instr, " -> (%d parents) ", child->dag.parent_count);
		}
	}
}

/* Determine if this is an instruction that we'd prefer not to schedule
 * yet, in order to avoid an (ss) sync.  This is limited by the sfu_delay
 * counter, ie. the more cycles it has been since the last SFU, the less
 * costly a sync would be.
 */
static bool
would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	if (ctx->sfu_delay) {
		struct ir3_register *reg;
		foreach_src (reg, instr)
			if (reg->instr && is_sfu(reg->instr))
				return true;
	}

	return false;
}

/* find instruction to schedule: */
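/* Roughly, the priority order implemented below: meta instructions first,
 * then inputs, then kills and sfu/tex that are ready without nop's, then
 * "soft ready" candidates that would not force an (ss) sync, then anything
 * soft/hard ready, and finally whichever leader has the largest max_delay.
 */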
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_node *chosen = NULL;

	dump_state(ctx);

	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_meta(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (meta)");
		return chosen->instr;
	}

	/* Try to schedule inputs with a higher priority, if possible, as
	 * the last bary.f unlocks varying storage to unblock more VS
	 * warps.
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_input(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (input)");
		return chosen->instr;
	}

	/* Next prioritize discards: */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!is_kill(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (kill, hard ready)");
		return chosen->instr;
	}

	/* Next prioritize expensive instructions: */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!(is_sfu(n->instr) || is_tex(n->instr)))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
		return chosen->instr;
	}

	/*
	 * Sometimes it is better to take a nop, rather than scheduling an
	 * instruction that would require an (ss) shortly after another
	 * SFU.  Ie. if the last SFU was just one or two instructions ago,
	 * and we could either take a nop and then schedule something
	 * else, or schedule the immediately available instruction that
	 * would require an (ss), we are better off with the nop.
	 */
	for (unsigned delay = 0; delay < 4; delay++) {
		foreach_sched_node (n, &ctx->dag->heads) {
			if (would_sync(ctx, n->instr))
				continue;

			unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

			if (d > delay)
				continue;

			if (!chosen || (chosen->max_delay < n->max_delay))
				chosen = n;
		}

		if (chosen) {
			di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
			return chosen->instr;
		}
	}

	/* Next try to find a ready leader w/ soft delay (ie. including extra
	 * delay for things like tex fetch, which can be synchronized w/ the
	 * sync bit, but where we probably do want to schedule some other
	 * instructions while we wait).
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

		if (d > 0)
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (soft ready)");
		return chosen->instr;
	}

	/* Next try to find a ready leader that can be scheduled without nop's,
	 * which in the case of things that need (sy)/(ss) could result in
	 * stalls.. but we've already decided there is not a better option.
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (hard ready)");
		return chosen->instr;
	}

	/* Otherwise choose leader with maximum cost:
	 *
	 * TODO should we try to balance cost and delays?  I guess it is
	 * a balance between now-nop's and future-nop's?
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!chosen || chosen->max_delay < n->max_delay)
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (leader)");
		return chosen->instr;
	}

	return NULL;
}

struct ir3_postsched_deps_state {
	struct ir3_context *ctx;

	enum { F, R } direction;

	bool merged;

	/* Track, for each register, the sched node (instruction) that last
	 * wrote it (in whichever direction we are iterating the block).
	 *
	 * Note, this table is twice as big as the # of regs, to deal with
	 * half-precision regs.  The approach differs depending on whether
	 * the half and full precision register files are "merged" (conflict,
	 * ie. a6xx+): in the merged case we consider each full precision dep
	 * as two half-precision dependencies, whereas on older gens with
	 * separate (non-conflicting) files the first half of the table is
	 * used for full precision and the 2nd half for half-precision.
	 */
	struct ir3_postsched_node *regs[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx) *({ \
	assert((idx) < ARRAY_SIZE((state)->regs)); \
	&(state)->regs[(idx)]; \
	})

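/* Add a scheduling edge between the node that last wrote a register slot
 * (per dep_reg) and the current access.  In the forward pass this captures
 * read-after-write and write-after-write ordering; in the reverse pass
 * "before" is really the later (in program order) writer, and the flipped
 * edge direction gives us write-after-read ordering.
 */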
static void
add_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *before,
		struct ir3_postsched_node *after)
{
	if (!before || !after)
		return;

	assert(before != after);

	if (state->direction == F) {
		dag_add_edge(&before->dag, &after->dag, NULL);
	} else {
		dag_add_edge(&after->dag, &before->dag, NULL);
	}
}

static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node, unsigned num, bool write)
{
	add_dep(state, dep_reg(state, num), node);
	if (write) {
		dep_reg(state, num) = node;
	}
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node, const struct ir3_register *reg,
		unsigned num, bool write)
{
	if (state->merged) {
		if (reg->flags & IR3_REG_HALF) {
			/* single conflict in half-reg space: */
			add_single_reg_dep(state, node, num, write);
		} else {
			/* two conflicts in half-reg space: */
			add_single_reg_dep(state, node, 2 * num + 0, write);
			add_single_reg_dep(state, node, 2 * num + 1, write);
		}
	} else {
		if (reg->flags & IR3_REG_HALF)
			num += ARRAY_SIZE(state->regs) / 2;
		add_single_reg_dep(state, node, num, write);
	}
}

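/* Add the register-level DAG edges for one instruction: its sources
 * (including relative/array reads and the address register) and then its
 * destination(s), updating the last-writer table as we go.  In the forward
 * direction we also accumulate the worst-case delay slots needed from the
 * producers of our sources.
 */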
static void
calculate_deps(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node)
{
	struct ir3_register *reg;
	int b;

	/* Add dependencies on instructions that previously (or next,
	 * in the reverse direction) wrote any of our src registers:
	 */
	foreach_src_n (reg, i, node->instr) {
		if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
			continue;

		if (reg->flags & IR3_REG_RELATIV) {
			/* mark entire array as read: */
			struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
			for (unsigned i = 0; i < arr->length; i++) {
				add_reg_dep(state, node, reg, arr->reg + i, false);
			}
		} else {
			foreach_bit (b, reg->wrmask) {
				add_reg_dep(state, node, reg, reg->num + b, false);

				struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
				if (dep && (state->direction == F)) {
					unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
					node->delay = MAX2(node->delay, d);
				}
			}
		}
	}

	if (node->instr->address) {
		add_reg_dep(state, node, node->instr->address->regs[0],
				node->instr->address->regs[0]->num,
				false);
	}

	if (dest_regs(node->instr) == 0)
		return;

	/* And then afterwards, update the state for what this instruction
	 * wrote:
	 */
	reg = node->instr->regs[0];
	if (reg->flags & IR3_REG_RELATIV) {
		/* mark the entire array as written: */
		struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
		for (unsigned i = 0; i < arr->length; i++) {
			add_reg_dep(state, node, reg, arr->reg + i, true);
		}
	} else {
		foreach_bit (b, reg->wrmask) {
			add_reg_dep(state, node, reg, reg->num + b, true);
		}
	}
}

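/* The forward pass walks the block in program order, so dep_reg() tracks
 * the most recent writer of each register; the reverse pass walks it
 * backwards, so dep_reg() tracks the next writer and the resulting edges
 * cover write-after-read hazards.  Both passes share calculate_deps().
 */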
static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_deps_state state = {
			.ctx = ctx->ctx,
			.direction = F,
			.merged = ctx->ctx->compiler->gpu_id >= 600,
	};

	foreach_instr (instr, &ctx->unscheduled_list) {
		calculate_deps(&state, instr->data);
	}
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_deps_state state = {
			.ctx = ctx->ctx,
			.direction = R,
			.merged = ctx->ctx->compiler->gpu_id >= 600,
	};

	foreach_instr_rev (instr, &ctx->unscheduled_list) {
		calculate_deps(&state, instr->data);
	}
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

	dag_init_node(ctx->dag, &n->dag);

	n->instr = instr;
	instr->data = n;
}

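/* Bottom-up DAG callback: max_delay becomes the number of delay slots on
 * the longest path from this node to the end of the block, which is what
 * choose_instr() uses as its critical-path priority.
 */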
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
	struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
	uint32_t max_delay = 0;

	util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
		struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
		max_delay = MAX2(child->max_delay, max_delay);
	}

	n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

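/* Build the DAG for the current block: create a node per instruction,
 * add the forward/reverse register dependencies, the leftover false-dep
 * (SSA) edges and the kill -> tex/mem ordering edges, and finally compute
 * each node's max_delay.
 */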
static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
	ctx->mem_ctx = ralloc_context(NULL);

	ctx->dag = dag_create(ctx->mem_ctx);

	foreach_instr (instr, &ctx->unscheduled_list)
		sched_node_init(ctx, instr);

	calculate_forward_deps(ctx);
	calculate_reverse_deps(ctx);

	/*
	 * To avoid expensive texture fetches, etc, from being moved ahead
	 * of kills, track the kills we've seen so far, so we can add an
	 * extra dependency on them for tex/mem instructions
	 */
	struct util_dynarray kills;
	util_dynarray_init(&kills, ctx->mem_ctx);

	/*
	 * Normal srcs won't be in SSA at this point, those are dealt with in
	 * calculate_forward_deps() and calculate_reverse_deps().  But we still
	 * have the false-dep information in SSA form, so go ahead and add
	 * dependencies for that here:
	 */
	foreach_instr (instr, &ctx->unscheduled_list) {
		struct ir3_postsched_node *n = instr->data;
		struct ir3_instruction *src;

		foreach_ssa_src_n (src, i, instr) {
			/* don't consider dependencies in other blocks: */
			if (src->block != instr->block)
				continue;

			/* we can end up with unused false-deps.. just skip them: */
			if (src->flags & IR3_INSTR_UNUSED)
				continue;

			struct ir3_postsched_node *sn = src->data;

			dag_add_edge(&sn->dag, &n->dag, NULL);
		}

		if (is_kill(instr)) {
			util_dynarray_append(&kills, struct ir3_instruction *, instr);
		} else if (is_tex(instr) || is_mem(instr)) {
			util_dynarray_foreach(&kills, struct ir3_instruction *, instrp) {
				struct ir3_instruction *kill = *instrp;
				struct ir3_postsched_node *kn = kill->data;
				dag_add_edge(&kn->dag, &n->dag, NULL);
			}
		}
	}

	// TODO do we want to do this after reverse-dependencies?
	dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
	ralloc_free(ctx->mem_ctx);
	ctx->mem_ctx = NULL;
	ctx->dag = NULL;
}

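/* Schedule a single block: move its instructions onto the unscheduled
 * list (dropping any existing nop's and branches), build the DAG, give
 * inputs and tex-prefetches priority, and then drain the DAG with
 * choose_instr(), inserting nop's for any remaining delay cycles.
 */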
static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
	ctx->block = block;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &ctx->unscheduled_list);
	list_inithead(&block->instr_list);

	// TODO once we are using post-sched for everything we can
	// just not stick in NOP's prior to post-sched, and drop this.
	// for now keep this, since it makes post-sched optional:
	foreach_instr_safe (instr, &ctx->unscheduled_list) {
		switch (instr->opc) {
		case OPC_NOP:
		case OPC_B:
		case OPC_JUMP:
			list_delinit(&instr->node);
			break;
		default:
			break;
		}
	}

	sched_dag_init(ctx);

	/* First schedule all meta:input instructions, followed by
	 * tex-prefetch.  We want all of the instructions that load
	 * values into registers before the shader starts to go
	 * before any other instructions.  But in particular we
	 * want inputs to come before prefetches.  This is because
	 * a FS's bary_ij input may not actually be live in the
	 * shader, but it should not be scheduled on top of any
	 * other input (but can be overwritten by a tex prefetch)
	 */
	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_INPUT)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_TEX_PREFETCH)
			schedule(ctx, instr);

	while (!list_is_empty(&ctx->unscheduled_list)) {
		struct ir3_instruction *instr;

		instr = choose_instr(ctx);

		/* this shouldn't happen: */
		if (!instr) {
			ctx->error = true;
			break;
		}

		unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
		d("delay=%u", delay);

		/* and if we run out of instructions that can be scheduled,
		 * then it is time for nop's:
		 */
		debug_assert(delay <= 6);
		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		schedule(ctx, instr);
	}

	sched_dag_destroy(ctx);
}

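/* A "self mov" is a plain same-type mov whose src and dst are the same
 * (non-relative, unmodified) register, ie. a copy that does nothing.
 */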
static bool
is_self_mov(struct ir3_instruction *instr)
{
	if (!is_same_type_mov(instr))
		return false;

	if (instr->regs[0]->num != instr->regs[1]->num)
		return false;

	if (instr->regs[0]->flags & IR3_REG_RELATIV)
		return false;

	if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
			IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
			IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT |
			IR3_REG_EVEN | IR3_REG_POS_INF))
		return false;

	return true;
}

/* Sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y,
 * as a result of cases where, before RA, we were not sure that it was
 * safe to eliminate them.  We could eliminate these earlier, but sometimes
 * they are tangled up in false-dep's, etc, so it is easier just to
 * let them exist until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		foreach_instr_safe (instr, &block->instr_list) {
			struct ir3_register *reg;

			foreach_src (reg, instr) {
				if (!reg->instr)
					continue;

				if (is_self_mov(reg->instr)) {
					list_delinit(&reg->instr->node);
					reg->instr = reg->instr->regs[1]->instr;
				}
			}

			for (unsigned i = 0; i < instr->deps_count; i++) {
				if (instr->deps[i] && is_self_mov(instr->deps[i])) {
					list_delinit(&instr->deps[i]->node);
					instr->deps[i] = instr->deps[i]->regs[1]->instr;
				}
			}
		}
	}
}

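/* Pass entry point: clean up nop's and self-mov's left over from earlier
 * stages, then schedule each block.  Returns -1 if scheduling got stuck
 * (which shouldn't happen), 0 otherwise.
 */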
int
ir3_postsched(struct ir3_context *cctx)
{
	struct ir3_postsched_ctx ctx = {
			.ctx = cctx,
	};

	ir3_remove_nops(cctx->ir);
	cleanup_self_movs(cctx->ir);

	foreach_block (block, &cctx->ir->block_list) {
		sched_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}