freedreno/ir3: update SFU delay
[mesa.git] / src / freedreno / ir3 / ir3_postsched.c
/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...) do { if (SCHED_DEBUG) { \
	printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
} } while (0)

#define di(instr, fmt, ...) do { if (SCHED_DEBUG) { \
	printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
	ir3_print_instr(instr); \
} } while (0)

/*
 * Post RA Instruction Scheduling
 */
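
/* Overview: after register allocation the SSA graph is gone, so each block
 * is re-ordered purely on physical register dependencies.  The block's
 * instructions are moved to an unscheduled list, a DAG of register (and
 * false) dependencies is built, and instructions are picked back out in an
 * order that tries to hide delay slots, inserting nop's only where
 * unavoidable.
 */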

struct ir3_postsched_ctx {
	struct ir3_context *ctx;

	void *mem_ctx;
	struct ir3_block *block;           /* the current block */
	struct dag *dag;

	struct list_head unscheduled_list; /* unscheduled instructions */
	struct ir3_instruction *scheduled; /* last scheduled instr */
	struct ir3_instruction *pred;      /* last instruction to write p0.x, if any */

	bool error;
};

struct ir3_postsched_node {
	struct dag_node dag;               /* must be first for util_dynarray_foreach */
	struct ir3_instruction *instr;
	bool partially_evaluated_path;

	unsigned delay;
	unsigned max_delay;
};

#define foreach_sched_node(__n, __list) \
	list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)

#define foreach_bit(b, mask) \
	for (uint32_t _m = ({debug_assert((mask) >= 1); (mask);}); _m && ({(b) = u_bit_scan(&_m); 1;});)

// TODO deduplicate
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

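/* Commit one instruction: drop it from the unscheduled list, append it to
 * the block, remember it as the last-scheduled instruction (and as the last
 * p0.x writer, if applicable), and prune it from the DAG so its consumers
 * can become ready.  A nop is stuffed between back-to-back SFU/mem
 * instructions, per the constraint described below.
 */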
static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from unscheduled_list:
	 */
	list_delinit(&instr->node);

	if (writes_pred(instr)) {
		ctx->pred = instr;
	}

	di(instr, "schedule");

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	struct ir3_postsched_node *n = instr->data;
	dag_prune_head(ctx->dag, &n->dag);
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
	if (!SCHED_DEBUG)
		return;

	foreach_sched_node (n, &ctx->dag->heads) {
		di(n->instr, "maxdel=%3d ", n->max_delay);

		util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
			struct ir3_postsched_node *child =
				(struct ir3_postsched_node *)edge->child;

			di(child->instr, " -> (%d parents) ", child->dag.parent_count);
		}
	}
}

/* find instruction to schedule: */
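/* Priority order, highest first: meta instructions, then inputs (so varying
 * storage is released as early as possible), then kill instructions and
 * then sfu/tex instructions that are ready without nop's, then anything
 * whose delay is zero even counting "soft" (sy/ss) delays, then anything
 * ready ignoring soft delays, and finally the DAG head with the largest
 * max_delay.
 */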
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_node *chosen = NULL;

	dump_state(ctx);

	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_meta(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (meta)");
		return chosen->instr;
	}

	/* Try to schedule inputs with a higher priority, if possible, as
	 * the last bary.f unlocks varying storage to unblock more VS
	 * warps.
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!is_input(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "prio: chose (input)");
		return chosen->instr;
	}

	/* Next prioritize discards: */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!is_kill(n->instr))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (kill, hard ready)");
		return chosen->instr;
	}

	/* Next prioritize expensive instructions: */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!(is_sfu(n->instr) || is_tex(n->instr)))
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
		return chosen->instr;
	}

	/* Next try to find a ready leader w/ soft delay (ie. including extra
	 * delay for things like tex fetch, which can be synchronized w/ the
	 * sync bit, but where we probably do want to schedule some other
	 * instructions while we wait):
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);

		if (d > 0)
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (soft ready)");
		return chosen->instr;
	}

	/* Next try to find a ready leader that can be scheduled without nop's,
	 * which in the case of things that need (sy)/(ss) could result in
	 * stalls.. but we've already decided there is not a better option.
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);

		if (d > 0)
			continue;

		if (!chosen || (chosen->max_delay < n->max_delay))
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (hard ready)");
		return chosen->instr;
	}

	/* Otherwise choose leader with maximum cost:
	 *
	 * TODO should we try to balance cost and delays?  I guess it is
	 * a balance between now-nop's and future-nop's?
	 */
	foreach_sched_node (n, &ctx->dag->heads) {
		if (!chosen || chosen->max_delay < n->max_delay)
			chosen = n;
	}

	if (chosen) {
		di(chosen->instr, "csp: chose (leader)");
		return chosen->instr;
	}

	return NULL;
}

struct ir3_postsched_deps_state {
	struct ir3_context *ctx;

	enum { F, R } direction;

	bool merged;

	/* Track, for each register, the sched node (instruction) that last
	 * wrote it (in whichever direction we are iterating the block).
	 *
	 * Note, this table is twice as big as the # of regs, to deal with
	 * half-precision regs.  The approach differs depending on whether
	 * the half and full precision register files are "merged" (conflicting,
	 * ie. a6xx+), in which case we consider each full precision dep as
	 * two half-precision dependencies, vs older separate (non-conflicting)
	 * files, in which case the first half of the table is used for full
	 * precision and the 2nd half for half-precision.
	 */
	struct ir3_postsched_node *regs[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx) *({ \
	assert((idx) < ARRAY_SIZE((state)->regs)); \
	&(state)->regs[(idx)]; \
	})

static void
add_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *before,
		struct ir3_postsched_node *after)
{
	if (!before || !after)
		return;

	assert(before != after);

	if (state->direction == F) {
		dag_add_edge(&before->dag, &after->dag, NULL);
	} else {
		dag_add_edge(&after->dag, &before->dag, NULL);
	}
}

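/* Add a dependency against whatever last wrote a single reg-file slot:
 * both reads and writes depend on the previous writer, and a write then
 * becomes the new "last writer" for that slot.
 */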
static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node, unsigned num, bool write)
{
	add_dep(state, dep_reg(state, num), node);
	if (write) {
		dep_reg(state, num) = node;
	}
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node, const struct ir3_register *reg,
		unsigned num, bool write)
{
	if (state->merged) {
		if (reg->flags & IR3_REG_HALF) {
			/* single conflict in half-reg space: */
			add_single_reg_dep(state, node, num, write);
		} else {
			/* two conflicts in half-reg space: */
			add_single_reg_dep(state, node, 2 * num + 0, write);
			add_single_reg_dep(state, node, 2 * num + 1, write);
		}
	} else {
		if (reg->flags & IR3_REG_HALF)
			num += ARRAY_SIZE(state->regs) / 2;
		add_single_reg_dep(state, node, num, write);
	}
}

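/* Add DAG edges for one instruction against the last writer of each
 * register it reads or writes (in whichever direction we are walking the
 * block), and in the forward direction accumulate the worst-case delay-slot
 * count from its producers into node->delay.
 */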
static void
calculate_deps(struct ir3_postsched_deps_state *state,
		struct ir3_postsched_node *node)
{
	static const struct ir3_register half_reg = { .flags = IR3_REG_HALF };
	struct ir3_register *reg;
	int b;

	/* Add dependencies on instructions that previously (or next,
	 * in the reverse direction) wrote any of our src registers:
	 */
	foreach_src_n (reg, i, node->instr) {
		/* NOTE: relative access for a src can be either const or gpr: */
		if (reg->flags & IR3_REG_RELATIV) {
			/* also reads a0.x: */
			add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
		}

		if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
			continue;

		if (reg->flags & IR3_REG_RELATIV) {
			/* mark entire array as read: */
			struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
			for (unsigned i = 0; i < arr->length; i++) {
				add_reg_dep(state, node, reg, arr->reg + i, false);
			}
		} else {
			foreach_bit (b, reg->wrmask) {
				add_reg_dep(state, node, reg, reg->num + b, false);

				struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
				if (dep && (state->direction == F)) {
					unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
					node->delay = MAX2(node->delay, d);
				}
			}
		}
	}

	if (dest_regs(node->instr) == 0)
		return;

	/* And then update the state for what this instruction
	 * wrote:
	 */
	reg = node->instr->regs[0];
	if (reg->flags & IR3_REG_RELATIV) {
		/* mark the entire array as written: */
		struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
		for (unsigned i = 0; i < arr->length; i++) {
			add_reg_dep(state, node, reg, arr->reg + i, true);
		}

		/* also reads a0.x: */
		add_reg_dep(state, node, &half_reg, regid(REG_A0, 0), false);
	} else {
		foreach_bit (b, reg->wrmask) {
			add_reg_dep(state, node, reg, reg->num + b, true);
		}
	}
}

static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_deps_state state = {
			.ctx = ctx->ctx,
			.direction = F,
			.merged = ctx->ctx->compiler->gpu_id >= 600,
	};

	foreach_instr (instr, &ctx->unscheduled_list) {
		calculate_deps(&state, instr->data);
	}
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
	struct ir3_postsched_deps_state state = {
			.ctx = ctx->ctx,
			.direction = R,
			.merged = ctx->ctx->compiler->gpu_id >= 600,
	};

	foreach_instr_rev (instr, &ctx->unscheduled_list) {
		calculate_deps(&state, instr->data);
	}
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

	dag_init_node(ctx->dag, &n->dag);

	n->instr = instr;
	instr->data = n;
}

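/* Bottom-up DAG callback: a node's max_delay becomes its own delay plus the
 * largest max_delay among its children, and is used as the primary
 * scheduling priority in choose_instr().
 */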
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
	struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
	uint32_t max_delay = 0;

	util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
		struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
		max_delay = MAX2(child->max_delay, max_delay);
	}

	n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

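/* Build the scheduling DAG for the current block: one node per unscheduled
 * instruction, register dependencies in both the forward and reverse
 * directions, plus edges for any false-deps, followed by a bottom-up pass
 * to compute each node's max_delay priority.
 */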
static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
	ctx->mem_ctx = ralloc_context(NULL);

	ctx->dag = dag_create(ctx->mem_ctx);

	foreach_instr (instr, &ctx->unscheduled_list)
		sched_node_init(ctx, instr);

	calculate_forward_deps(ctx);
	calculate_reverse_deps(ctx);

	/*
	 * Normal srcs won't be in SSA at this point, those are dealt with in
	 * calculate_forward_deps() and calculate_reverse_deps().  But we still
	 * have the false-dep information in SSA form, so go ahead and add
	 * dependencies for that here:
	 */
	foreach_instr (instr, &ctx->unscheduled_list) {
		struct ir3_postsched_node *n = instr->data;
		struct ir3_instruction *src;

		foreach_ssa_src_n (src, i, instr) {
			/* we can end up with unused false-deps.. just skip them: */
			if (src->flags & IR3_INSTR_UNUSED)
				continue;

			/* don't consider dependencies in other blocks: */
			if (src->block != instr->block)
				continue;

			struct ir3_postsched_node *sn = src->data;

			dag_add_edge(&sn->dag, &n->dag, NULL);
		}
	}

	// TODO do we want to do this after reverse-dependencies?
	dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
	ralloc_free(ctx->mem_ctx);
	ctx->mem_ctx = NULL;
	ctx->dag = NULL;
}

static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
	ctx->block = block;
	ctx->scheduled = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &ctx->unscheduled_list);
	list_inithead(&block->instr_list);

	// TODO once we are using post-sched for everything we can
	// just not stick in NOP's prior to post-sched, and drop this.
	// for now keep this, since it makes post-sched optional:
	foreach_instr_safe (instr, &ctx->unscheduled_list) {
		switch (instr->opc) {
		case OPC_NOP:
		case OPC_BR:
		case OPC_JUMP:
			list_delinit(&instr->node);
			break;
		default:
			break;
		}
	}

	sched_dag_init(ctx);

	/* First schedule all meta:input instructions, followed by
	 * tex-prefetch.  We want all of the instructions that load
	 * values into registers before the shader starts to go
	 * before any other instructions.  But in particular we
	 * want inputs to come before prefetches.  This is because
	 * a FS's bary_ij input may not actually be live in the
	 * shader, but it should not be scheduled on top of any
	 * other input (but can be overwritten by a tex prefetch)
	 */
	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_INPUT)
			schedule(ctx, instr);

	foreach_instr_safe (instr, &ctx->unscheduled_list)
		if (instr->opc == OPC_META_TEX_PREFETCH)
			schedule(ctx, instr);

	while (!list_is_empty(&ctx->unscheduled_list)) {
		struct ir3_instruction *instr;

		instr = choose_instr(ctx);

		/* this shouldn't happen: */
		if (!instr) {
			ctx->error = true;
			break;
		}

		unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
		d("delay=%u", delay);

		/* and if we run out of instructions that can be scheduled,
		 * then it is time for nop's:
		 */
		debug_assert(delay <= 6);
		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		schedule(ctx, instr);
	}

	sched_dag_destroy(ctx);
}


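/* Detect a same-type mov whose src and dst are the same (non-relative)
 * GPR with no modifiers, ie. a copy that does nothing after RA.
 */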
static bool
is_self_mov(struct ir3_instruction *instr)
{
	if (!is_same_type_mov(instr))
		return false;

	if (instr->regs[0]->num != instr->regs[1]->num)
		return false;

	if (instr->regs[0]->flags & IR3_REG_RELATIV)
		return false;

	if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
			IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
			IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT |
			IR3_REG_EVEN | IR3_REG_POS_INF))
		return false;

	return true;
}

/* Sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y,
 * as a result of places where before RA we are not sure that it is
 * safe to eliminate them.  We could eliminate these earlier, but
 * sometimes they are tangled up in false-dep's, etc, so it is easier
 * just to let them exist until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		foreach_instr_safe (instr, &block->instr_list) {
			struct ir3_register *reg;

			foreach_src (reg, instr) {
				if (!reg->instr)
					continue;

				if (is_self_mov(reg->instr)) {
					list_delinit(&reg->instr->node);
					reg->instr = reg->instr->regs[1]->instr;
				}
			}

			for (unsigned i = 0; i < instr->deps_count; i++) {
				if (is_self_mov(instr->deps[i])) {
					list_delinit(&instr->deps[i]->node);
					instr->deps[i] = instr->deps[i]->regs[1]->instr;
				}
			}
		}
	}
}

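/* Pass entry point: strip any existing nop's, clean up self-movs that RA
 * made redundant, and then re-schedule each block.  Returns 0 on success,
 * -1 if scheduling failed.
 */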
int
ir3_postsched(struct ir3_context *cctx)
{
	struct ir3_postsched_ctx ctx = {
			.ctx = cctx,
	};

	ir3_remove_nops(cctx->ir);
	cleanup_self_movs(cctx->ir);

	foreach_block (block, &cctx->ir->block_list) {
		sched_block(&ctx, block);
	}

	if (ctx.error)
		return -1;

	return 0;
}