2 * Copyright (C) 2019 Google, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * Rob Clark <robclark@freedesktop.org>
29 #include "util/u_math.h"
32 #include "ir3_compiler.h"
33 #include "ir3_context.h"
/*
 * Debug helpers: SCHED_DEBUG gates this pass's trace output via the
 * IR3_DBG_SCHEDMSGS flag; d() prints a formatted "PSCHED:" line and
 * di() prints a "PSCHED:" prefix followed by the instruction itself
 * via ir3_print_instr().
 * NOTE(review): the "} } while (0)" terminators of d()/di() were lost
 * in extraction -- surviving macro text kept byte-identical.
 */
36 #define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
40 #define d(fmt, ...) do { if (SCHED_DEBUG) { \
41 printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
44 #define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
45 printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
46 ir3_print_instr(instr); \
/*
 * Post-RA scheduling context: the compile context, the block currently
 * being scheduled, and the list of instructions not yet placed back
 * into the block.
 * NOTE(review): fields referenced elsewhere in this file (mem_ctx, dag,
 * sfu_delay, tex_delay) were dropped by the extraction and are not
 * visible here.
 */
50 * Post RA Instruction Scheduling
53 struct ir3_postsched_ctx
{
54 struct ir3_context
*ctx
;
57 struct ir3_block
*block
; /* the current block */
60 struct list_head unscheduled_list
; /* unscheduled instructions */
/*
 * Per-instruction scheduler node, embedded in the dependency DAG.
 * The dag_node member must be first so the node can be recovered by a
 * cast from a struct dag_node pointer (see sched_dag_max_delay_cb).
 * NOTE(review): the delay/max_delay fields used elsewhere in this file
 * are missing from this extracted view.
 */
68 struct ir3_postsched_node
{
69 struct dag_node dag
; /* must be first for util_dynarray_foreach */
70 struct ir3_instruction
*instr
;
71 bool partially_evaluated_path
;
/*
 * foreach_sched_node: iterate ir3_postsched_node entries linked through
 * their dag.link list member.
 * foreach_bit: iterate the set-bit positions of 'mask' (asserted to be
 * non-zero), assigning each position to 'b' via u_bit_scan().
 */
77 #define foreach_sched_node(__n, __list) \
78 list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)
80 #define foreach_bit(b, mask) \
81 for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)
/*
 * Commit one chosen instruction: unlink it from ctx->unscheduled_list,
 * append it to the tail of its block's instr_list, and prune its node
 * from the DAG so its successors can become scheduling candidates.
 * The trailing (truncated) branches update the ctx sfu/tex delay
 * bookkeeping depending on whether the instruction is, or consumes the
 * result of, an SFU or tex/prefetch instruction.
 * NOTE(review): several original lines (branch bodies, closing braces,
 * the counter updates themselves) were lost in extraction; the
 * surviving text below is kept byte-identical.
 */
84 schedule(struct ir3_postsched_ctx
*ctx
, struct ir3_instruction
*instr
)
86 debug_assert(ctx
->block
== instr
->block
);
88 /* remove from unscheduled_list:
90 list_delinit(&instr
->node
);
92 di(instr
, "schedule");
94 list_addtail(&instr
->node
, &instr
->block
->instr_list
);
96 struct ir3_postsched_node
*n
= instr
->data
;
97 dag_prune_head(ctx
->dag
, &n
->dag
);
99 if (is_meta(instr
) && (instr
->opc
!= OPC_META_TEX_PREFETCH
))
104 } else if (check_src_cond(instr
, is_sfu
)) {
106 } else if (ctx
->sfu_delay
> 0) {
110 if (is_tex_or_prefetch(instr
)) {
112 } else if (check_src_cond(instr
, is_tex_or_prefetch
)) {
114 } else if (ctx
->tex_delay
> 0) {
/*
 * Debug dump (via di()) of the current DAG heads: prints each ready
 * node with its max_delay, then each outgoing edge's child node with
 * that child's remaining parent_count.
 */
120 dump_state(struct ir3_postsched_ctx
*ctx
)
125 foreach_sched_node (n
, &ctx
->dag
->heads
) {
126 di(n
->instr
, "maxdel=%3d ", n
->max_delay
);
128 util_dynarray_foreach(&n
->dag
.edges
, struct dag_edge
, edge
) {
129 struct ir3_postsched_node
*child
=
130 (struct ir3_postsched_node
*)edge
->child
;
132 di(child
->instr
, " -> (%d parents) ", child
->dag
.parent_count
);
/*
 * NOTE(review): the checks below test whether 'instr' consumes an SFU
 * result while ctx->sfu_delay is still counting down, or a
 * tex/prefetch result while ctx->tex_delay is still counting down --
 * i.e. whether scheduling it now would force an expensive sync.  The
 * 'return' statements and closing braces were lost in extraction.
 */
137 /* Determine if this is an instruction that we'd prefer not to schedule
138 * yet, in order to avoid an (ss) sync. This is limited by the sfu_delay
139 * counter, ie. the more cycles it has been since the last SFU, the less
140 * costly a sync would be.
143 would_sync(struct ir3_postsched_ctx
*ctx
, struct ir3_instruction
*instr
)
145 if (ctx
->sfu_delay
) {
146 if (check_src_cond(instr
, is_sfu
))
150 if (ctx
->tex_delay
) {
151 if (check_src_cond(instr
, is_tex_or_prefetch
))
/*
 * Pick the next instruction to schedule from the current DAG heads,
 * trying a series of priority classes in order: meta instructions,
 * inputs, kills, expensive sfu/tex instructions, then sync-avoiding
 * candidates (would_sync() false, soft-ready within an increasing
 * delay budget), then any soft-ready candidate, then any hard-ready
 * candidate, and finally the leader with the largest max_delay.
 * Within each class the candidate with the largest max_delay wins.
 * Readiness is computed with ir3_delay_calc() (soft=true includes
 * extra delay for sync-able things like tex fetch).
 * NOTE(review): the 'continue' statements, readiness comparisons
 * against 'd', 'chosen = n' assignments, NULL-checks before the
 * returns, and many closing braces were dropped by the extraction;
 * the surviving text is kept byte-identical.
 */
158 /* find instruction to schedule: */
159 static struct ir3_instruction
*
160 choose_instr(struct ir3_postsched_ctx
*ctx
)
162 struct ir3_postsched_node
*chosen
= NULL
;
166 foreach_sched_node (n
, &ctx
->dag
->heads
) {
167 if (!is_meta(n
->instr
))
170 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
175 di(chosen
->instr
, "prio: chose (meta)");
176 return chosen
->instr
;
179 /* Try to schedule inputs with a higher priority, if possible, as
180 * the last bary.f unlocks varying storage to unblock more VS
183 foreach_sched_node (n
, &ctx
->dag
->heads
) {
184 if (!is_input(n
->instr
))
187 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
192 di(chosen
->instr
, "prio: chose (input)");
193 return chosen
->instr
;
196 /* Next prioritize discards: */
197 foreach_sched_node (n
, &ctx
->dag
->heads
) {
198 unsigned d
= ir3_delay_calc(ctx
->block
, n
->instr
, false, false);
203 if (!is_kill(n
->instr
))
206 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
211 di(chosen
->instr
, "csp: chose (kill, hard ready)");
212 return chosen
->instr
;
215 /* Next prioritize expensive instructions: */
216 foreach_sched_node (n
, &ctx
->dag
->heads
) {
217 unsigned d
= ir3_delay_calc(ctx
->block
, n
->instr
, false, false);
222 if (!(is_sfu(n
->instr
) || is_tex(n
->instr
)))
225 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
230 di(chosen
->instr
, "csp: chose (sfu/tex, hard ready)");
231 return chosen
->instr
;
235 * Sometimes be better to take a nop, rather than scheduling an
236 * instruction that would require an (ss) shortly after another
237 * SFU.. ie. if last SFU was just one or two instr ago, and we
238 * could choose between taking a nop and then scheduling
239 * something else, vs scheduling the immed avail instruction that
240 * would require (ss), we are better with the nop.
242 for (unsigned delay
= 0; delay
< 4; delay
++) {
243 foreach_sched_node (n
, &ctx
->dag
->heads
) {
244 if (would_sync(ctx
, n
->instr
))
247 unsigned d
= ir3_delay_calc(ctx
->block
, n
->instr
, true, false);
252 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
257 di(chosen
->instr
, "csp: chose (soft ready, delay=%u)", delay
);
258 return chosen
->instr
;
262 /* Next try to find a ready leader w/ soft delay (ie. including extra
263 * delay for things like tex fetch which can be synchronized w/ sync
264 * bit (but we probably do want to schedule some other instructions
267 foreach_sched_node (n
, &ctx
->dag
->heads
) {
268 unsigned d
= ir3_delay_calc(ctx
->block
, n
->instr
, true, false);
273 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
278 di(chosen
->instr
, "csp: chose (soft ready)");
279 return chosen
->instr
;
282 /* Next try to find a ready leader that can be scheduled without nop's,
283 * which in the case of things that need (sy)/(ss) could result in
284 * stalls.. but we've already decided there is not a better option.
286 foreach_sched_node (n
, &ctx
->dag
->heads
) {
287 unsigned d
= ir3_delay_calc(ctx
->block
, n
->instr
, false, false);
292 if (!chosen
|| (chosen
->max_delay
< n
->max_delay
))
297 di(chosen
->instr
, "csp: chose (hard ready)");
298 return chosen
->instr
;
301 /* Otherwise choose leader with maximum cost:
303 * TODO should we try to balance cost and delays? I guess it is
304 * a balance between now-nop's and future-nop's?
306 foreach_sched_node (n
, &ctx
->dag
->heads
) {
307 if (!chosen
|| chosen
->max_delay
< n
->max_delay
)
312 di(chosen
->instr
, "csp: chose (leader)");
313 return chosen
->instr
;
/*
 * State carried through one forward (F) or reverse (R) dependency-
 * calculation pass over the unscheduled instructions.  The regs[] table
 * maps a register slot to the scheduler node that last wrote it in the
 * current iteration direction; it is twice the register-file size to
 * accommodate half-precision regs (see the original comment below).
 * NOTE(review): a 'merged' flag is initialized by the callers
 * (calculate_forward/reverse_deps) but its declaration was lost in
 * extraction.
 */
319 struct ir3_postsched_deps_state
{
320 struct ir3_context
*ctx
;
322 enum { F
, R
} direction
;
326 /* Track the mapping between sched node (instruction) that last
327 * wrote a given register (in whichever direction we are iterating
330 * Note, this table is twice as big as the # of regs, to deal with
331 * half-precision regs. The approach differs depending on whether
332 * the half and full precision register files are "merged" (conflict,
333 * ie. a6xx+) in which case we consider each full precision dep
334 * as two half-precision dependencies, vs older separate (non-
335 * conflicting) in which case the first half of the table is used
336 * for full precision and 2nd half for half-precision.
338 struct ir3_postsched_node
*regs
[2 * 256];
/*
 * Bounds-checked lvalue accessor for state->regs[idx]; asserts idx is
 * in range before yielding the slot (usable for both read and write).
 * NOTE(review): the closing "})" of the statement-expression was lost
 * in extraction.
 */
341 /* bounds checking read/write accessors, since OoB access to stuff on
342 * the stack is gonna cause a bad day.
344 #define dep_reg(state, idx) *({ \
345 assert((idx) < ARRAY_SIZE((state)->regs)); \
346 &(state)->regs[(idx)]; \
/*
 * Add a DAG edge between two scheduler nodes, oriented according to
 * the pass direction: before -> after when iterating forward (F),
 * after -> before when iterating in reverse.  Tolerates NULL for
 * either node (no edge added); a self-edge is asserted against.
 */
350 add_dep(struct ir3_postsched_deps_state
*state
,
351 struct ir3_postsched_node
*before
,
352 struct ir3_postsched_node
*after
)
354 if (!before
|| !after
)
357 assert(before
!= after
);
359 if (state
->direction
== F
) {
360 dag_add_edge(&before
->dag
, &after
->dag
, NULL
);
362 dag_add_edge(&after
->dag
, &before
->dag
, NULL
);
/*
 * Record a dependency between 'node' and the node that currently owns
 * register slot 'num' in the deps table, then make 'node' the new
 * owner of that slot.
 * NOTE(review): the 'write' parameter presumably guards the ownership
 * update (reads should not clobber the last writer) -- the guard line
 * appears to have been lost in extraction; confirm against the full
 * source.
 */
367 add_single_reg_dep(struct ir3_postsched_deps_state
*state
,
368 struct ir3_postsched_node
*node
, unsigned num
, bool write
)
370 add_dep(state
, dep_reg(state
, num
), node
);
372 dep_reg(state
, num
) = node
;
/*
 * Register dependency entry point that resolves half- vs full-
 * precision register conflicts (see the original comment below).
 * On "merged" register files a half reg maps to one slot and a full
 * reg to two adjacent slots (2*num, 2*num+1); on separate files half
 * regs are offset into the second half of the table.
 * NOTE(review): the branch selecting between the merged and separate
 * layouts ('if (state->merged)' and associated braces) was lost in
 * extraction.
 */
376 /* This is where we handled full vs half-precision, and potential conflicts
377 * between half and full precision that result in additional dependencies.
378 * The 'reg' arg is really just to know half vs full precision.
381 add_reg_dep(struct ir3_postsched_deps_state
*state
,
382 struct ir3_postsched_node
*node
, const struct ir3_register
*reg
,
383 unsigned num
, bool write
)
386 if (reg
->flags
& IR3_REG_HALF
) {
387 /* single conflict in half-reg space: */
388 add_single_reg_dep(state
, node
, num
, write
);
390 /* two conflicts in half-reg space: */
391 add_single_reg_dep(state
, node
, 2 * num
+ 0, write
);
392 add_single_reg_dep(state
, node
, 2 * num
+ 1, write
);
395 if (reg
->flags
& IR3_REG_HALF
)
396 num
+= ARRAY_SIZE(state
->regs
) / 2;
397 add_single_reg_dep(state
, node
, num
, write
);
/*
 * Compute register dependencies for one instruction, in whichever
 * direction 'state' is iterating.  First walks the sources: const/
 * immed srcs are skipped, relative srcs mark the whole backing array
 * as read, and normal srcs add a read dep per wrmask bit; in the
 * forward direction node->delay additionally accumulates the worst
 * ir3_delayslots() of any producing instruction.  An address-register
 * dep is added when instr->address is set.  Then (unless the
 * instruction writes no dest regs) the destination is recorded: a
 * relative dst marks the whole array written, otherwise each wrmask
 * bit's register slot is marked written.
 * NOTE(review): 'continue'/'return' statements and closing braces were
 * dropped by the extraction; the surviving text is kept byte-identical.
 */
402 calculate_deps(struct ir3_postsched_deps_state
*state
,
403 struct ir3_postsched_node
*node
)
405 struct ir3_register
*reg
;
408 /* Add dependencies on instructions that previously (or next,
409 * in the reverse direction) wrote any of our src registers:
411 foreach_src_n (reg
, i
, node
->instr
) {
412 if (reg
->flags
& (IR3_REG_CONST
| IR3_REG_IMMED
))
415 if (reg
->flags
& IR3_REG_RELATIV
) {
416 /* mark entire array as read: */
417 struct ir3_array
*arr
= ir3_lookup_array(state
->ctx
->ir
, reg
->array
.id
);
418 for (unsigned i
= 0; i
< arr
->length
; i
++) {
419 add_reg_dep(state
, node
, reg
, arr
->reg
+ i
, false);
422 foreach_bit (b
, reg
->wrmask
) {
423 add_reg_dep(state
, node
, reg
, reg
->num
+ b
, false);
425 struct ir3_postsched_node
*dep
= dep_reg(state
, reg
->num
+ b
);
426 if (dep
&& (state
->direction
== F
)) {
427 unsigned d
= ir3_delayslots(dep
->instr
, node
->instr
, i
, true);
428 node
->delay
= MAX2(node
->delay
, d
);
434 if (node
->instr
->address
) {
435 add_reg_dep(state
, node
, node
->instr
->address
->regs
[0],
436 node
->instr
->address
->regs
[0]->num
,
440 if (dest_regs(node
->instr
) == 0)
443 /* And then after we update the state for what this instruction
446 reg
= node
->instr
->regs
[0];
447 if (reg
->flags
& IR3_REG_RELATIV
) {
448 /* mark the entire array as written: */
449 struct ir3_array
*arr
= ir3_lookup_array(state
->ctx
->ir
, reg
->array
.id
);
450 for (unsigned i
= 0; i
< arr
->length
; i
++) {
451 add_reg_dep(state
, node
, reg
, arr
->reg
+ i
, true);
454 foreach_bit (b
, reg
->wrmask
) {
455 add_reg_dep(state
, node
, reg
, reg
->num
+ b
, true);
/*
 * Forward dependency pass: iterates the unscheduled list front-to-back
 * feeding each instruction's node to calculate_deps().  The deps state
 * marks the register files as "merged" on a6xx+ (gpu_id >= 600).
 * NOTE(review): the .ctx and .direction = F designated initializers
 * were lost in extraction.
 */
461 calculate_forward_deps(struct ir3_postsched_ctx
*ctx
)
463 struct ir3_postsched_deps_state state
= {
466 .merged
= ctx
->ctx
->compiler
->gpu_id
>= 600,
469 foreach_instr (instr
, &ctx
->unscheduled_list
) {
470 calculate_deps(&state
, instr
->data
);
/*
 * Reverse dependency pass: same as calculate_forward_deps() but walks
 * the unscheduled list back-to-front (foreach_instr_rev), so "last
 * writer" in the deps table means the next writer in program order.
 * NOTE(review): the .ctx and .direction = R designated initializers
 * were lost in extraction.
 */
475 calculate_reverse_deps(struct ir3_postsched_ctx
*ctx
)
477 struct ir3_postsched_deps_state state
= {
480 .merged
= ctx
->ctx
->compiler
->gpu_id
>= 600,
483 foreach_instr_rev (instr
, &ctx
->unscheduled_list
) {
484 calculate_deps(&state
, instr
->data
);
/*
 * Allocate (zeroed, on ctx->mem_ctx) a scheduler node for 'instr' and
 * register it with the DAG.
 * NOTE(review): the lines linking n->instr and instr->data to each
 * other were lost in extraction (later code reads instr->data as the
 * node).
 */
489 sched_node_init(struct ir3_postsched_ctx
*ctx
, struct ir3_instruction
*instr
)
491 struct ir3_postsched_node
*n
= rzalloc(ctx
->mem_ctx
, struct ir3_postsched_node
);
493 dag_init_node(ctx
->dag
, &n
->dag
);
/*
 * Bottom-up DAG traversal callback: sets n->max_delay to the largest
 * child max_delay plus this node's own delay -- a critical-path style
 * priority consumed by choose_instr().  The cast from dag_node relies
 * on 'dag' being the first member of ir3_postsched_node.
 */
500 sched_dag_max_delay_cb(struct dag_node
*node
, void *state
)
502 struct ir3_postsched_node
*n
= (struct ir3_postsched_node
*)node
;
503 uint32_t max_delay
= 0;
505 util_dynarray_foreach(&n
->dag
.edges
, struct dag_edge
, edge
) {
506 struct ir3_postsched_node
*child
= (struct ir3_postsched_node
*)edge
->child
;
507 max_delay
= MAX2(child
->max_delay
, max_delay
);
510 n
->max_delay
= MAX2(n
->max_delay
, max_delay
+ n
->delay
);
/*
 * Build the dependency DAG for the current block: allocate the ralloc
 * context and DAG, create a node per unscheduled instruction, run the
 * forward and reverse register-dependency passes, then add false-dep
 * edges still available in SSA form, plus extra edges keeping tex/mem
 * instructions after any kill already seen.  Finally computes each
 * node's max_delay via a bottom-up traversal.
 * NOTE(review): the src->block != instr->block check appears twice in
 * the ssa-src loop below -- the first occurrence looks redundant with
 * the second; 'continue' statements and closing braces were lost in
 * extraction.
 */
514 sched_dag_init(struct ir3_postsched_ctx
*ctx
)
516 ctx
->mem_ctx
= ralloc_context(NULL
);
518 ctx
->dag
= dag_create(ctx
->mem_ctx
);
520 foreach_instr (instr
, &ctx
->unscheduled_list
)
521 sched_node_init(ctx
, instr
);
523 calculate_forward_deps(ctx
);
524 calculate_reverse_deps(ctx
);
527 * To avoid expensive texture fetches, etc, from being moved ahead
528 * of kills, track the kills we've seen so far, so we can add an
529 * extra dependency on them for tex/mem instructions
531 struct util_dynarray kills
;
532 util_dynarray_init(&kills
, ctx
->mem_ctx
);
535 * Normal srcs won't be in SSA at this point, those are dealt with in
536 * calculate_forward_deps() and calculate_reverse_deps(). But we still
537 * have the false-dep information in SSA form, so go ahead and add
538 * dependencies for that here:
540 foreach_instr (instr
, &ctx
->unscheduled_list
) {
541 struct ir3_postsched_node
*n
= instr
->data
;
542 struct ir3_instruction
*src
;
544 foreach_ssa_src_n (src
, i
, instr
) {
545 if (src
->block
!= instr
->block
)
548 /* we can end up with unused false-deps.. just skip them: */
549 if (src
->flags
& IR3_INSTR_UNUSED
)
552 struct ir3_postsched_node
*sn
= src
->data
;
554 /* don't consider dependencies in other blocks: */
555 if (src
->block
!= instr
->block
)
558 dag_add_edge(&sn
->dag
, &n
->dag
, NULL
);
561 if (is_kill(instr
)) {
562 util_dynarray_append(&kills
, struct ir3_instruction
*, instr
);
563 } else if (is_tex(instr
) || is_mem(instr
)) {
564 util_dynarray_foreach(&kills
, struct ir3_instruction
*, instrp
) {
565 struct ir3_instruction
*kill
= *instrp
;
566 struct ir3_postsched_node
*kn
= kill
->data
;
567 dag_add_edge(&kn
->dag
, &n
->dag
, NULL
);
572 // TODO do we want to do this after reverse-dependencies?
573 dag_traverse_bottom_up(ctx
->dag
, sched_dag_max_delay_cb
, NULL
);
/*
 * Free everything allocated by sched_dag_init() in one shot by
 * releasing the ralloc context that owns the DAG and all nodes.
 */
577 sched_dag_destroy(struct ir3_postsched_ctx
*ctx
)
579 ralloc_free(ctx
->mem_ctx
);
/*
 * Schedule one block: move its instructions onto ctx->unscheduled_list
 * and empty the block, strip pre-existing NOPs (the switch whose case
 * labels were lost in extraction), schedule all meta:input then
 * tex-prefetch instructions first (see the original comment), and then
 * repeatedly choose_instr() until the unscheduled list drains --
 * emitting NOPs (lost lines) when the chosen instruction still has
 * delay slots to cover (asserted <= 6).  The DAG is built/destroyed
 * around the main loop.
 * NOTE(review): the sched_dag_init() call, ctx->block assignment, NOP
 * insertion, and several closing braces were lost in extraction; the
 * surviving text is kept byte-identical.
 */
585 sched_block(struct ir3_postsched_ctx
*ctx
, struct ir3_block
*block
)
589 /* move all instructions to the unscheduled list, and
590 * empty the block's instruction list (to which we will
593 list_replace(&block
->instr_list
, &ctx
->unscheduled_list
);
594 list_inithead(&block
->instr_list
);
596 // TODO once we are using post-sched for everything we can
597 // just not stick in NOP's prior to post-sched, and drop this.
598 // for now keep this, since it makes post-sched optional:
599 foreach_instr_safe (instr
, &ctx
->unscheduled_list
) {
600 switch (instr
->opc
) {
604 list_delinit(&instr
->node
);
613 /* First schedule all meta:input instructions, followed by
614 * tex-prefetch. We want all of the instructions that load
615 * values into registers before the shader starts to go
616 * before any other instructions. But in particular we
617 * want inputs to come before prefetches. This is because
618 * a FS's bary_ij input may not actually be live in the
619 * shader, but it should not be scheduled on top of any
620 * other input (but can be overwritten by a tex prefetch)
622 foreach_instr_safe (instr
, &ctx
->unscheduled_list
)
623 if (instr
->opc
== OPC_META_INPUT
)
624 schedule(ctx
, instr
);
626 foreach_instr_safe (instr
, &ctx
->unscheduled_list
)
627 if (instr
->opc
== OPC_META_TEX_PREFETCH
)
628 schedule(ctx
, instr
);
630 while (!list_is_empty(&ctx
->unscheduled_list
)) {
631 struct ir3_instruction
*instr
;
633 instr
= choose_instr(ctx
);
635 /* this shouldn't happen: */
641 unsigned delay
= ir3_delay_calc(ctx
->block
, instr
, false, false);
642 d("delay=%u", delay
);
644 /* and if we run out of instructions that can be scheduled,
645 * then it is time for nop's:
647 debug_assert(delay
<= 6);
653 schedule(ctx
, instr
);
656 sched_dag_destroy(ctx
);
/*
 * True when 'instr' is a same-type mov whose destination and source
 * are the same register number, with no relative addressing on the
 * destination and no const/immed/relative/modifier flags on the source
 * -- i.e. a genuine no-op copy that is safe to delete after RA.
 * NOTE(review): the 'return false;' / 'return true;' lines were lost
 * in extraction.
 */
661 is_self_mov(struct ir3_instruction
*instr
)
663 if (!is_same_type_mov(instr
))
666 if (instr
->regs
[0]->num
!= instr
->regs
[1]->num
)
669 if (instr
->regs
[0]->flags
& IR3_REG_RELATIV
)
672 if (instr
->regs
[1]->flags
& (IR3_REG_CONST
| IR3_REG_IMMED
|
673 IR3_REG_RELATIV
| IR3_REG_FNEG
| IR3_REG_FABS
|
674 IR3_REG_SNEG
| IR3_REG_SABS
| IR3_REG_BNOT
|
675 IR3_REG_EVEN
| IR3_REG_POS_INF
))
/*
 * Remove in-place movs (see the original comment below) from every
 * block: when a src's producing instruction is a self-mov, the mov is
 * unlinked and the reference is rewired to the mov's own source; the
 * same treatment is applied to each entry of the false-dep array.
 * NOTE(review): the '®' character on the first list_delinit() line
 * below is extraction mojibake for '&reg' (the '&reg;' HTML entity was
 * rendered as the registered-trademark sign); restore it when fixing
 * up this file.
 */
681 /* sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y
682 * as a result of places were before RA we are not sure that it is
683 * safe to eliminate. We could eliminate these earlier, but sometimes
684 * they are tangled up in false-dep's, etc, so it is easier just to
685 * let them exist until after RA
688 cleanup_self_movs(struct ir3
*ir
)
690 foreach_block (block
, &ir
->block_list
) {
691 foreach_instr_safe (instr
, &block
->instr_list
) {
692 struct ir3_register
*reg
;
694 foreach_src (reg
, instr
) {
698 if (is_self_mov(reg
->instr
)) {
699 list_delinit(®
->instr
->node
);
700 reg
->instr
= reg
->instr
->regs
[1]->instr
;
704 for (unsigned i
= 0; i
< instr
->deps_count
; i
++) {
705 if (instr
->deps
[i
] && is_self_mov(instr
->deps
[i
])) {
706 list_delinit(&instr
->deps
[i
]->node
);
707 instr
->deps
[i
] = instr
->deps
[i
]->regs
[1]->instr
;
715 ir3_postsched(struct ir3_context
*cctx
)
717 struct ir3_postsched_ctx ctx
= {
721 ir3_remove_nops(cctx
->ir
);
722 cleanup_self_movs(cctx
->ir
);
724 foreach_block (block
, &cctx
->ir
->block_list
) {
725 sched_block(&ctx
, block
);