nir/scheduler: Handle nir_intrinsic_load_per_vertex_input
[mesa.git] src/compiler/nir/nir_schedule.c
1 /*
2 * Copyright © 2019 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "nir.h"
25 #include "util/dag.h"
26 #include "util/u_dynarray.h"
27
28 /** @file
29 *
30 * Implements basic-block-level prepass instruction scheduling in NIR to
31 * manage register pressure.
32 *
33 * This is based on the Goodman/Hsu paper (1988, cached copy at
34 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
35 * make up the DDG for NIR (which can be mostly done using the NIR def/use
36 * chains for SSA instructions, plus some edges for ordering register writes
37 * vs reads, and some more for ordering intrinsics). Then we pick heads off
38 * of the DDG using their heuristic to emit the NIR instructions back into the
39 * block in their new order.
40 *
41 * The hard case for prepass scheduling on GPUs seems to always be consuming
42 * texture/ubo results. The register pressure heuristic doesn't want to pick
43 * an instr that starts consuming texture results because it usually won't be
44 * the only usage, so that instruction will increase pressure.
45 *
46 * If you try to force consumption of tex results always, then in a case where
47 * a single sample is used for many outputs, you'll end up picking every other
48 * user and expanding register pressure. The partially_evaluated_path flag
49 * helps tremendously, in that if you happen for whatever reason to pick a
50 * texture sample's output, then you'll try to finish off that sample. Future
51 * work may include doing some local search before locking in a choice, to try
52 * to more reliably find the case where just a few choices going against the
53 * heuristic can manage to free the whole vector.
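*
* As a small illustration of the pressure problem (not tied to any one
* shader): if a 4-channel texture result has three separate ALU users,
* scheduling just one of them keeps all four channels live while adding that
* user's own destination, so pressure goes up; only once every user has been
* scheduled does the texture result die and pressure drop by 4.
* partially_evaluated_path is what nudges the pressure heuristic toward
* scheduling the remaining users at that point.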
54 */
55
56 static bool debug;
57
58 /**
59 * Represents a node in the DDG for a NIR instruction.
60 */
61 typedef struct {
62 struct dag_node dag; /* must be first for our u_dynarray_foreach */
63 nir_instr *instr;
64 bool partially_evaluated_path;
65
66 /* Rough estimate of the delay between starting this instruction and
67 * its results being available.
68 *
69 * Accuracy is not too important, given that we're prepass scheduling here
70 * and just trying to reduce excess dependencies introduced by a register
71 * allocator by stretching out the live intervals of expensive
72 * instructions.
73 */
74 uint32_t delay;
75
76 /* Cost of the maximum-delay path from this node to the leaves. */
77 uint32_t max_delay;
78
79 /* scoreboard->time value when this instruction can be scheduled without
80 * any stalls expected.
81 */
82 uint32_t ready_time;
83 } nir_schedule_node;
84
85 typedef struct {
86 struct dag *dag;
87
88 nir_shader *shader;
89
90 /* Mapping from nir_register * or nir_ssa_def * to a struct set of
91 * instructions remaining to be scheduled using the register.
92 */
93 struct hash_table *remaining_uses;
94
95 /* Map from nir_instr to nir_schedule_node * */
96 struct hash_table *instr_map;
97
98 /* Set of nir_register * or nir_ssa_def * that have had any instruction
99 * scheduled on them.
100 */
101 struct set *live_values;
102
103 /* An abstract approximation of the number of nir_schedule_node->delay
104 * units since the start of the shader.
105 */
106 uint32_t time;
107
108 /* Number of channels currently used by the NIR instructions that have been
109 * scheduled.
110 */
111 int pressure;
112
113 /* Number of channels that may be in use before we switch to the
114 * pressure-prioritizing scheduling heuristic.
115 */
116 int threshold;
117 } nir_schedule_scoreboard;
118
119 /* When walking the instructions in reverse, we use this flag to swap
120 * before/after in add_dep().
121 */
122 enum direction { F, R };
123
124 typedef struct {
125 nir_shader *shader;
126
127 /* Map from nir_instr to nir_schedule_node * */
128 struct hash_table *instr_map;
129 /* Map from nir_register to nir_schedule_node * */
130 struct hash_table *reg_map;
131
132 /* Scheduler nodes for the last instruction involved in each class of dependency.
133 */
134 nir_schedule_node *load_input;
135 nir_schedule_node *store_shared;
136 nir_schedule_node *unknown_intrinsic;
137 nir_schedule_node *discard;
138 nir_schedule_node *jump;
139
140 enum direction dir;
141 } nir_deps_state;
142
143 static void *
144 _mesa_hash_table_search_data(struct hash_table *ht, void *key)
145 {
146 struct hash_entry *entry = _mesa_hash_table_search(ht, key);
147 if (!entry)
148 return NULL;
149 return entry->data;
150 }
151
152 static nir_schedule_node *
153 nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
154 {
155 return _mesa_hash_table_search_data(instr_map, instr);
156 }
157
158 static struct set *
159 nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
160 {
161 if (src->is_ssa) {
162 return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
163 } else {
164 return _mesa_hash_table_search_data(scoreboard->remaining_uses,
165 src->reg.reg);
166 }
167 }
168
169 static int
170 nir_schedule_def_pressure(nir_ssa_def *def)
171 {
172 return def->num_components;
173 }
174
175 static int
176 nir_schedule_src_pressure(nir_src *src)
177 {
178 if (src->is_ssa)
179 return nir_schedule_def_pressure(src->ssa);
180 else
181 return src->reg.reg->num_components;
182 }
183
184 static int
185 nir_schedule_dest_pressure(nir_dest *dest)
186 {
187 if (dest->is_ssa)
188 return nir_schedule_def_pressure(&dest->ssa);
189 else
190 return dest->reg.reg->num_components;
191 }
192
193 /**
194 * Adds a dependency such that @after must appear in the final program after
195 * @before.
196 *
197 * We add @after as a child of @before, so that DAG heads are instructions
198 * with no remaining dependencies and scheduling proceeds from the top down.
199 */
200 static void
201 add_dep(nir_deps_state *state,
202 nir_schedule_node *before,
203 nir_schedule_node *after)
204 {
205 if (!before || !after)
206 return;
207
208 assert(before != after);
209
210 if (state->dir == F)
211 dag_add_edge(&before->dag, &after->dag, NULL);
212 else
213 dag_add_edge(&after->dag, &before->dag, NULL);
214 }
215
216
217 static void
218 add_read_dep(nir_deps_state *state,
219 nir_schedule_node *before,
220 nir_schedule_node *after)
221 {
222 add_dep(state, before, after);
223 }
224
225 static void
226 add_write_dep(nir_deps_state *state,
227 nir_schedule_node **before,
228 nir_schedule_node *after)
229 {
230 add_dep(state, *before, after);
231 *before = after;
232 }
233
234 static bool
235 nir_schedule_reg_src_deps(nir_src *src, void *in_state)
236 {
237 nir_deps_state *state = in_state;
238
239 if (src->is_ssa)
240 return true;
241
242 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
243 src->reg.reg);
244 if (!entry)
245 return true;
246 nir_schedule_node *dst_n = entry->data;
247
248 nir_schedule_node *src_n = nir_schedule_get_node(state->instr_map,
249 src->parent_instr);
250
251 add_dep(state, dst_n, src_n);
252
253 return true;
254 }
255
256 static bool
257 nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
258 {
259 nir_deps_state *state = in_state;
260
261 if (dest->is_ssa)
262 return true;
263
264 nir_schedule_node *dest_n = nir_schedule_get_node(state->instr_map,
265 dest->reg.parent_instr);
266
267 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
268 dest->reg.reg);
269 if (!entry) {
270 _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
271 return true;
272 }
273 nir_schedule_node **before = (nir_schedule_node **)&entry->data;
274
275 add_write_dep(state, before, dest_n);
276
277 return true;
278 }
279
280 static bool
281 nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
282 {
283 nir_deps_state *state = in_state;
284 nir_schedule_node *def_n = nir_schedule_get_node(state->instr_map, def->parent_instr);
285
286 nir_foreach_use(src, def) {
287 nir_schedule_node *use_n = nir_schedule_get_node(state->instr_map,
288 src->parent_instr);
289
290 add_read_dep(state, def_n, use_n);
291 }
292
293 return true;
294 }
295
296 static void
297 nir_schedule_intrinsic_deps(nir_deps_state *state,
298 nir_intrinsic_instr *instr)
299 {
300 nir_schedule_node *n = nir_schedule_get_node(state->instr_map, &instr->instr);
301
302 switch (instr->intrinsic) {
303 case nir_intrinsic_load_uniform:
304 case nir_intrinsic_load_ubo:
305 case nir_intrinsic_load_front_face:
306 break;
307
308 case nir_intrinsic_discard:
309 case nir_intrinsic_discard_if:
310 /* We are adding two dependencies:
311 *
312 * * An individual one that we could use to add a read_dep while handling
313 * nir_instr_type_tex
314 *
315 * * One on the unknown-intrinsic set, as we want discards to be
316 * serialized in the same order relative to intervening stores or
317 * atomic accesses to SSBOs and images
318 */
319 add_write_dep(state, &state->discard, n);
320 add_write_dep(state, &state->unknown_intrinsic, n);
321 break;
322
323 case nir_intrinsic_store_output:
324 /* For some non-FS shader stages, or for some hardware, output stores
325 * affect the same shared memory as input loads.
326 */
327 if (state->shader->info.stage != MESA_SHADER_FRAGMENT)
328 add_write_dep(state, &state->load_input, n);
329
330 /* Make sure that preceding discards stay before the store_output */
331 add_read_dep(state, state->discard, n);
332
333 break;
334
335 case nir_intrinsic_load_input:
336 case nir_intrinsic_load_per_vertex_input:
337 add_read_dep(state, state->load_input, n);
338 break;
339
340 case nir_intrinsic_load_shared:
341 /* Don't move a load_shared beyond a following store_shared, as that could
342 * change its value.
343 */
344 add_read_dep(state, state->store_shared, n);
345 break;
346
347 case nir_intrinsic_store_shared:
348 add_write_dep(state, &state->store_shared, n);
349 break;
350
351 case nir_intrinsic_control_barrier:
352 case nir_intrinsic_memory_barrier_shared:
353 add_write_dep(state, &state->store_shared, n);
354
355 /* Serialize against ssbos/atomics/etc. */
356 add_write_dep(state, &state->unknown_intrinsic, n);
357 break;
358
359 default:
360 /* Attempt to handle other intrinsics that we haven't individually
361 * categorized by serializing them in the same order relative to each
362 * other.
363 */
364 add_write_dep(state, &state->unknown_intrinsic, n);
365 break;
366 }
367 }
368
369 /**
370 * Common code for dependencies that need to be tracked both forward and
371 * backward.
372 *
373 * This is for things like "all reads of r4 have to happen between the r4
374 * writes that surround them".
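*
* For instance, given "r4 = ...; use(r4); use(r4); r4 = ...", the forward
* pass makes both reads depend on the first write (and the second write on
* the first), while the reverse pass makes the second write depend on both
* reads, so the reads may swap with each other but never move across either
* write.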
375 */
376 static void
377 nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
378 {
379 nir_instr *instr = n->instr;
380
381 /* For NIR SSA defs, we only need to do a single pass of making the uses
382 * depend on the def.
383 */
384 if (state->dir == F)
385 nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);
386
387 /* For NIR regs, track the last writer in the scheduler state so that we
388 * can keep the writes in order and let reads be reordered only between
389 * the writes that surround them.
390 */
391 nir_foreach_src(instr, nir_schedule_reg_src_deps, state);
392
393 nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);
394
395 /* Make sure any other instructions keep their positions relative to
396 * jumps.
397 */
398 if (instr->type != nir_instr_type_jump)
399 add_read_dep(state, state->jump, n);
400
401 switch (instr->type) {
402 case nir_instr_type_ssa_undef:
403 case nir_instr_type_load_const:
404 case nir_instr_type_alu:
405 case nir_instr_type_deref:
406 break;
407
408 case nir_instr_type_tex:
409 /* Don't move texture ops before a discard, as that could increase
410 * memory bandwidth for reading the discarded samples.
411 */
412 add_read_dep(state, state->discard, n);
413 break;
414
415 case nir_instr_type_jump:
416 add_write_dep(state, &state->jump, n);
417 break;
418
419 case nir_instr_type_call:
420 unreachable("Calls should have been lowered");
421 break;
422
423 case nir_instr_type_parallel_copy:
424 unreachable("Parallel copies should have been lowered");
425 break;
426
427 case nir_instr_type_phi:
428 unreachable("nir_schedule() should be called after lowering from SSA");
429 break;
430
431 case nir_instr_type_intrinsic:
432 nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
433 break;
434 }
435 }
436
437 static void
438 calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
439 {
440 nir_deps_state state = {
441 .shader = scoreboard->shader,
442 .dir = F,
443 .instr_map = scoreboard->instr_map,
444 .reg_map = _mesa_pointer_hash_table_create(NULL),
445 };
446
447 nir_foreach_instr(instr, block) {
448 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
449 instr);
450 nir_schedule_calculate_deps(&state, node);
451 }
452
453 ralloc_free(state.reg_map);
454 }
455
456 static void
457 calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
458 {
459 nir_deps_state state = {
460 .shader = scoreboard->shader,
461 .dir = R,
462 .instr_map = scoreboard->instr_map,
463 .reg_map = _mesa_pointer_hash_table_create(NULL),
464 };
465
466 nir_foreach_instr_reverse(instr, block) {
467 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
468 instr);
469 nir_schedule_calculate_deps(&state, node);
470 }
471
472 ralloc_free(state.reg_map);
473 }
474
475 typedef struct {
476 nir_schedule_scoreboard *scoreboard;
477 int regs_freed;
478 } nir_schedule_regs_freed_state;
479
480 static bool
481 nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
482 {
483 nir_schedule_regs_freed_state *state = in_state;
484 nir_schedule_scoreboard *scoreboard = state->scoreboard;
485 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
486
487 if (remaining_uses->entries == 1 &&
488 _mesa_set_search(remaining_uses, src->parent_instr)) {
489 state->regs_freed += nir_schedule_src_pressure(src);
490 }
491
492 return true;
493 }
494
495 static bool
496 nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
497 {
498 nir_schedule_regs_freed_state *state = in_state;
499
500 state->regs_freed -= nir_schedule_def_pressure(def);
501
502 return true;
503 }
504
505 static bool
506 nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
507 {
508 nir_schedule_regs_freed_state *state = in_state;
509 nir_schedule_scoreboard *scoreboard = state->scoreboard;
510
511 if (dest->is_ssa)
512 return true;
513
514 nir_register *reg = dest->reg.reg;
515
516 /* Only the first def of a reg counts against register pressure. */
517 if (!_mesa_set_search(scoreboard->live_values, reg))
518 state->regs_freed -= nir_schedule_dest_pressure(dest);
519
520 return true;
521 }
522
523 static int
524 nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
525 {
526 nir_schedule_regs_freed_state state = {
527 .scoreboard = scoreboard,
528 };
529
530 nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
531
532 nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
533
534 nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);
535
536 return state.regs_freed;
537 }
538
539 /**
540 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
541 * Scheduling for Parallelism) heuristic.
542 *
543 * Picks the highest-cost instruction that's ready to execute without
544 * stalls, if possible; otherwise picks the instruction on the critical path.
545 */
546 static nir_schedule_node *
547 nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
548 {
549 nir_schedule_node *chosen = NULL;
550
551 /* Find the leader in the ready (shouldn't-stall) set with the maximum
552 * cost.
553 */
554 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
555 if (scoreboard->time < n->ready_time)
556 continue;
557
558 if (!chosen || chosen->max_delay < n->max_delay)
559 chosen = n;
560 }
561 if (chosen) {
562 if (debug) {
563 fprintf(stderr, "chose (ready): ");
564 nir_print_instr(chosen->instr, stderr);
565 fprintf(stderr, "\n");
566 }
567
568 return chosen;
569 }
570
571 /* Otherwise, choose the leader with the maximum cost. */
572 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
573 if (!chosen || chosen->max_delay < n->max_delay)
574 chosen = n;
575 }
576 if (debug) {
577 fprintf(stderr, "chose (leader): ");
578 nir_print_instr(chosen->instr, stderr);
579 fprintf(stderr, "\n");
580 }
581
582 return chosen;
583 }
584
585 /**
586 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
587 * Scheduling for Register pressure) heuristic.
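*
* The fallback order below is roughly: a ready instruction that frees
* registers, then any leader that frees registers, then a leader on a
* partially evaluated path, then a leader with no net effect on register
* pressure, then the ready instruction with the maximum delay, and finally
* the leader with the maximum delay.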
588 */
589 static nir_schedule_node *
590 nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
591 {
592 nir_schedule_node *chosen = NULL;
593
594 /* Find a ready inst with regs freed and pick the one with max cost. */
595 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
596 if (n->ready_time > scoreboard->time)
597 continue;
598
599 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
600
601 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
602 chosen = n;
603 }
604 }
605 if (chosen) {
606 if (debug) {
607 fprintf(stderr, "chose (freed+ready): ");
608 nir_print_instr(chosen->instr, stderr);
609 fprintf(stderr, "\n");
610 }
611
612 return chosen;
613 }
614
615 /* Find a leader with regs freed and pick the one with max cost. */
616 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
617 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
618
619 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
620 chosen = n;
621 }
622 }
623 if (chosen) {
624 if (debug) {
625 fprintf(stderr, "chose (regs freed): ");
626 nir_print_instr(chosen->instr, stderr);
627 fprintf(stderr, "\n");
628 }
629
630 return chosen;
631 }
632
633 /* Find a partially evaluated path and try to finish it off */
634 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
635 if (n->partially_evaluated_path &&
636 (!chosen || chosen->max_delay < n->max_delay)) {
637 chosen = n;
638 }
639 }
640 if (chosen) {
641 if (debug) {
642 fprintf(stderr, "chose (partial path): ");
643 nir_print_instr(chosen->instr, stderr);
644 fprintf(stderr, "\n");
645 }
646
647 return chosen;
648 }
649
650 /* Contra the paper, pick a leader with no effect on used regs. This may
651 * open up new opportunities, as otherwise a single-operand instr consuming
652 * a value will tend to keep us from ever freeing that value. This had a
653 * massive effect on reducing spilling on V3D.
654 *
655 * XXX: Should this prioritize ready?
656 */
657 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
658 if (nir_schedule_regs_freed(scoreboard, n) != 0)
659 continue;
660
661 if (!chosen || chosen->max_delay < n->max_delay)
662 chosen = n;
663 }
664 if (chosen) {
665 if (debug) {
666 fprintf(stderr, "chose (regs no-op): ");
667 nir_print_instr(chosen->instr, stderr);
668 fprintf(stderr, "\n");
669 }
670
671 return chosen;
672 }
673
674 /* Pick the max delay of the remaining ready set. */
675 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
676 if (n->ready_time > scoreboard->time)
677 continue;
678 if (!chosen || chosen->max_delay < n->max_delay)
679 chosen = n;
680 }
681 if (chosen) {
682 if (debug) {
683 fprintf(stderr, "chose (ready max delay): ");
684 nir_print_instr(chosen->instr, stderr);
685 fprintf(stderr, "\n");
686 }
687 return chosen;
688 }
689
690 /* Pick the max delay of the remaining leaders. */
691 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
692 if (!chosen || chosen->max_delay < n->max_delay)
693 chosen = n;
694 }
695
696 if (debug) {
697 fprintf(stderr, "chose (max delay): ");
698 nir_print_instr(chosen->instr, stderr);
699 fprintf(stderr, "\n");
700 }
701
702 return chosen;
703 }
704
705 static void
706 dump_state(nir_schedule_scoreboard *scoreboard)
707 {
708 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
709 fprintf(stderr, "maxdel %5d ", n->max_delay);
710 nir_print_instr(n->instr, stderr);
711 fprintf(stderr, "\n");
712
713 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
714 nir_schedule_node *child = (nir_schedule_node *)edge->child;
715
716 fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
717 nir_print_instr(child->instr, stderr);
718 fprintf(stderr, "\n");
719 }
720 }
721 }
722
723 static void
724 nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
725 void *reg_or_def,
726 nir_instr *reg_or_def_parent,
727 int pressure)
728 {
729 /* Make the value live if it's the first time it's been used. */
730 if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
731 _mesa_set_add(scoreboard->live_values, reg_or_def);
732 scoreboard->pressure += pressure;
733 }
734
735 /* Make the value dead if it's the last remaining use. Be careful when one
736 * instruction uses a value twice to not decrement pressure twice.
737 */
738 struct set *remaining_uses =
739 _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
740 struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
741 if (entry) {
742 _mesa_set_remove(remaining_uses, entry);
743
744 if (remaining_uses->entries == 0)
745 scoreboard->pressure -= pressure;
746 }
747 }
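
/* A sketch of how the bookkeeping above plays out for an SSA value:
 * scheduling the defining instruction reaches nir_schedule_mark_use() via
 * nir_schedule_mark_def_scheduled(), which makes the value live and adds
 * its channels to the pressure (the def's own instruction is seeded into
 * remaining_uses by the scoreboard setup). Each scheduled use then removes
 * itself from remaining_uses, and once the set empties the channels are
 * subtracted again.
 */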
748
749 static bool
750 nir_schedule_mark_src_scheduled(nir_src *src, void *state)
751 {
752 nir_schedule_scoreboard *scoreboard = state;
753 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
754
755 struct set_entry *entry = _mesa_set_search(remaining_uses,
756 src->parent_instr);
757 if (entry) {
758 /* Once we've used an SSA value in one instruction, bump the priority of
759 * the other uses so the SSA value can get fully consumed.
760 *
761 * We don't do this for registers, as it would be a hassle and it's
762 * unclear whether it would help. Also, skip it for constants, as
763 * they're often folded as immediates into backend instructions and have
764 * many unrelated instructions all referencing the same value (0).
765 */
766 if (src->is_ssa &&
767 src->ssa->parent_instr->type != nir_instr_type_load_const) {
768 nir_foreach_use(other_src, src->ssa) {
769 if (other_src->parent_instr == src->parent_instr)
770 continue;
771
772 nir_schedule_node *n =
773 nir_schedule_get_node(scoreboard->instr_map,
774 other_src->parent_instr);
775
776 if (n && !n->partially_evaluated_path) {
777 if (debug) {
778 fprintf(stderr, " New partially evaluated path: ");
779 nir_print_instr(n->instr, stderr);
780 fprintf(stderr, "\n");
781 }
782
783 n->partially_evaluated_path = true;
784 }
785 }
786 }
787 }
788
789 nir_schedule_mark_use(scoreboard,
790 src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
791 src->parent_instr,
792 nir_schedule_src_pressure(src));
793
794 return true;
795 }
796
797 static bool
798 nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
799 {
800 nir_schedule_scoreboard *scoreboard = state;
801
802 nir_schedule_mark_use(scoreboard, def, def->parent_instr,
803 nir_schedule_def_pressure(def));
804
805 return true;
806 }
807
808 static bool
809 nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
810 {
811 nir_schedule_scoreboard *scoreboard = state;
812
813 /* SSA defs were handled in nir_schedule_mark_def_scheduled()
814 */
815 if (dest->is_ssa)
816 return true;
817
818 /* XXX: This is not actually accurate for regs -- the last use of a reg may
819 * have a live interval that extends across control flow. We should
820 * calculate the live ranges of regs, and have scheduler nodes for the CF
821 * nodes that also "use" the reg.
822 */
823 nir_schedule_mark_use(scoreboard, dest->reg.reg,
824 dest->reg.parent_instr,
825 nir_schedule_dest_pressure(dest));
826
827 return true;
828 }
829
830 static void
831 nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
832 nir_schedule_node *n)
833 {
834 nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
835 nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
836 nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);
837
838 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
839 nir_schedule_node *child = (nir_schedule_node *)edge->child;
840
841 child->ready_time = MAX2(child->ready_time,
842 scoreboard->time + n->delay);
843
844 if (child->dag.parent_count == 1) {
845 if (debug) {
846 fprintf(stderr, " New DAG head: ");
847 nir_print_instr(child->instr, stderr);
848 fprintf(stderr, "\n");
849 }
850 }
851 }
852
853 dag_prune_head(scoreboard->dag, &n->dag);
854
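/* Advance the clock: jump forward if the chosen instruction had to wait
 * for a result, and charge at least one time unit per scheduled
 * instruction.
 */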
855 scoreboard->time = MAX2(n->ready_time, scoreboard->time);
856 scoreboard->time++;
857 }
858
859 static void
860 nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
861 {
862 while (!list_is_empty(&scoreboard->dag->heads)) {
863 if (debug) {
864 fprintf(stderr, "current list:\n");
865 dump_state(scoreboard);
866 }
867
868 nir_schedule_node *chosen;
869 if (scoreboard->pressure < scoreboard->threshold)
870 chosen = nir_schedule_choose_instruction_csp(scoreboard);
871 else
872 chosen = nir_schedule_choose_instruction_csr(scoreboard);
873
874 /* Now that we've scheduled a new instruction, some of its children may
875 * be promoted to the list of instructions ready to be scheduled.
876 */
877 nir_schedule_mark_node_scheduled(scoreboard, chosen);
878
879 /* Move the instruction to the end (so our first chosen instructions are
880 * the start of the program).
881 */
882 exec_node_remove(&chosen->instr->node);
883 exec_list_push_tail(&block->instr_list, &chosen->instr->node);
884
885 if (debug)
886 fprintf(stderr, "\n");
887 }
888 }
889
890 static uint32_t
891 nir_schedule_get_delay(nir_instr *instr)
892 {
893 switch (instr->type) {
894 case nir_instr_type_ssa_undef:
895 case nir_instr_type_load_const:
896 case nir_instr_type_alu:
897 case nir_instr_type_deref:
898 case nir_instr_type_jump:
899 case nir_instr_type_parallel_copy:
900 case nir_instr_type_call:
901 case nir_instr_type_phi:
902 return 1;
903
904 case nir_instr_type_intrinsic:
905 /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
906 return 1;
907
908 case nir_instr_type_tex:
909 /* Pick some large number to try to issue texture fetches early and
910 * consume their results late.
911 */
912 return 100;
913 }
914
915 return 0;
916 }
917
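/* Called bottom-up over the DAG, so every consumer has its max_delay set
 * before its producers are visited. As an example using the delays above:
 * for "tex (delay 100) -> fmul (delay 1) -> store_output (delay 1)", the
 * store ends up with max_delay 1, the fmul with 2, and the tex with 102,
 * which is what puts texture fetches on the critical path so the CSP
 * heuristic picks them early.
 */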
918 static void
919 nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
920 {
921 nir_schedule_node *n = (nir_schedule_node *)node;
922 uint32_t max_delay = 0;
923
924 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
925 nir_schedule_node *child = (nir_schedule_node *)edge->child;
926 max_delay = MAX2(child->max_delay, max_delay);
927 }
928
929 n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
930 }
931
932 static void
933 nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
934 {
935 void *mem_ctx = ralloc_context(NULL);
936 scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);
937
938 scoreboard->dag = dag_create(mem_ctx);
939
940 nir_foreach_instr(instr, block) {
941 nir_schedule_node *n =
942 rzalloc(mem_ctx, nir_schedule_node);
943
944 n->instr = instr;
945 n->delay = nir_schedule_get_delay(instr);
946 dag_init_node(scoreboard->dag, &n->dag);
947
948 _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
949 }
950
951 calculate_forward_deps(scoreboard, block);
952 calculate_reverse_deps(scoreboard, block);
953
954 dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);
955
956 nir_schedule_instructions(scoreboard, block);
957
958 ralloc_free(mem_ctx);
959 scoreboard->instr_map = NULL;
960 }
961
962 static bool
963 nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
964 {
965 nir_schedule_scoreboard *scoreboard = state;
966 struct set *def_uses = _mesa_pointer_set_create(scoreboard);
967
968 _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);
969
970 _mesa_set_add(def_uses, def->parent_instr);
971
972 nir_foreach_use(src, def) {
973 _mesa_set_add(def_uses, src->parent_instr);
974 }
975
976 /* XXX: Handle if uses */
977
978 return true;
979 }
980
981 static nir_schedule_scoreboard *
982 nir_schedule_get_scoreboard(nir_shader *shader, int threshold)
983 {
984 nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);
985
986 scoreboard->shader = shader;
987 scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
988 scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
989 scoreboard->threshold = threshold;
990 scoreboard->pressure = 0;
991
992 nir_foreach_function(function, shader) {
993 nir_foreach_register(reg, &function->impl->registers) {
994 struct set *register_uses =
995 _mesa_pointer_set_create(scoreboard);
996
997 _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);
998
999 nir_foreach_use(src, reg) {
1000 _mesa_set_add(register_uses, src->parent_instr);
1001 }
1002
1003 /* XXX: Handle if uses */
1004
1005 nir_foreach_def(dest, reg) {
1006 _mesa_set_add(register_uses, dest->reg.parent_instr);
1007 }
1008 }
1009
1010 nir_foreach_block(block, function->impl) {
1011 nir_foreach_instr(instr, block) {
1012 nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
1013 scoreboard);
1014 }
1015
1016 /* XXX: We're ignoring if uses, which may prioritize scheduling other
1017 * uses of the if src even when it doesn't help. That's not many
1018 * values, though, so meh.
1019 */
1020 }
1021 }
1022
1023 return scoreboard;
1024 }
1025
1026 static void
1027 nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
1028 {
1029 #ifdef NDEBUG
1030 return;
1031 #endif
1032
1033 bool any_uses = false;
1034
1035 hash_table_foreach(scoreboard->remaining_uses, entry) {
1036 struct set *remaining_uses = entry->data;
1037
1038 set_foreach(remaining_uses, instr_entry) {
1039 if (!any_uses) {
1040 fprintf(stderr, "Tracked uses remain after scheduling. "
1041 "Affected instructions: \n");
1042 any_uses = true;
1043 }
1044 nir_print_instr(instr_entry->key, stderr);
1045 fprintf(stderr, "\n");
1046 }
1047 }
1048
1049 assert(!any_uses);
1050 }
1051
1052 /**
1053 * Schedules the NIR instructions to try to decrease stalls (for example,
1054 * delaying the consumption of texture results) while managing register pressure.
1055 *
1056 * The threshold represents "number of NIR register/SSA def channels live
1057 * before switching the scheduling heuristic to reduce register pressure",
1058 * since most of our GPU architectures are scalar (extending to vector with a
1059 * flag wouldn't be hard). This number should be a bit below the number of
1060 * registers available (counting any that may be occupied by system value
1061 * payload values, for example), since the heuristic may not always be able to
1062 * free a register immediately. The amount below the limit is up to you to
1063 * tune.
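*
* For example, a backend with 64 temporary channels available per thread
* might call nir_schedule(shader, 48), leaving some slack for values the
* heuristic cannot free right away; the numbers here are illustrative
* rather than taken from any particular driver.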
1064 */
1065 void
1066 nir_schedule(nir_shader *shader, int threshold)
1067 {
1068 nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
1069 threshold);
1070
1071 if (debug) {
1072 fprintf(stderr, "NIR shader before scheduling:\n");
1073 nir_print_shader(shader, stderr);
1074 }
1075
1076 nir_foreach_function(function, shader) {
1077 if (!function->impl)
1078 continue;
1079
1080 nir_foreach_block(block, function->impl) {
1081 nir_schedule_block(scoreboard, block);
1082 }
1083 }
1084
1085 nir_schedule_validate_uses(scoreboard);
1086
1087 ralloc_free(scoreboard);
1088 }