[mesa.git] / src / compiler / nir / nir_schedule.c
1 /*
2 * Copyright © 2019 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "nir_schedule.h"
25 #include "util/dag.h"
26 #include "util/u_dynarray.h"
27
28 /** @file
29 *
30 * Implements basic-block-level prepass instruction scheduling in NIR to
31 * manage register pressure.
32 *
33 * This is based on the Goodman/Hsu paper (1988, cached copy at
34 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
35 * make up the DDG for NIR (which can be mostly done using the NIR def/use
36 * chains for SSA instructions, plus some edges for ordering register writes
37 * vs reads, and some more for ordering intrinsics). Then we pick heads off
38 * of the DDG using their heuristic to emit the NIR instructions back into the
39 * block in their new order.
40 *
41 * The hard case for prepass scheduling on GPUs seems to always be consuming
42 * texture/ubo results. The register pressure heuristic doesn't want to pick
43 * an instr that starts consuming texture results because it usually won't be
44 * the only usage, so that instruction will increase pressure.
45 *
46 * If you always try to force consumption of tex results, then in a case where
47 * a single sample is used for many outputs, you'll end up picking every other
48 * user and expanding register pressure. The partially_evaluated_path flag
49 * helps tremendously, in that if you happen for whatever reason to pick a
50 * texture sample's output, then you'll try to finish off that sample. Future
51 * work may include doing some local search before locking in a choice, to try
52 * to more reliably find the case where just a few choices going against the
53 * heuristic can manage to free the whole vector.
54 */
55
56 static bool debug;
57
58 /**
59 * Represents a node in the DDG for a NIR instruction.
60 */
61 typedef struct {
62 struct dag_node dag; /* must be first for our u_dynarray_foreach */
63 nir_instr *instr;
64 bool partially_evaluated_path;
65
66 /* Approximate estimate of the delay between starting this instruction and
67 * its results being available.
68 *
69 * Accuracy is not too important, given that we're prepass scheduling here
70 * and just trying to reduce excess dependencies introduced by a register
71 * allocator by stretching out the live intervals of expensive
72 * instructions.
73 */
74 uint32_t delay;
75
76 /* Cost of the maximum-delay path from this node to the leaves. */
77 uint32_t max_delay;
78
79 /* scoreboard->time value when this instruction can be scheduled without
80 * any stalls expected.
81 */
82 uint32_t ready_time;
83 } nir_schedule_node;
84
85 typedef struct {
86 struct dag *dag;
87
88 nir_shader *shader;
89
90 /* Mapping from nir_register * or nir_ssa_def * to a struct set of
91 * instructions remaining to be scheduled using the register.
92 */
93 struct hash_table *remaining_uses;
94
95 /* Map from nir_instr to nir_schedule_node * */
96 struct hash_table *instr_map;
97
98 /* Set of nir_register * or nir_ssa_def * that have had any instruction
99 * scheduled on them.
100 */
101 struct set *live_values;
102
103 /* An abstract approximation of the number of nir_schedule_node->delay
104 * units since the start of the shader.
105 */
106 uint32_t time;
107
108 /* Number of channels currently used by the NIR instructions that have been
109 * scheduled.
110 */
111 int pressure;
112
113 /* Options specified by the backend */
114 const nir_schedule_options *options;
115 } nir_schedule_scoreboard;
116
117 /* When walking the instructions in reverse, we use this flag to swap
118 * before/after in add_dep().
119 */
120 enum direction { F, R };
121
122 typedef struct {
123 nir_schedule_scoreboard *scoreboard;
124
125 /* Map from nir_register to nir_schedule_node * */
126 struct hash_table *reg_map;
127
128 /* Scheduler nodes for the last instruction involved in each class of dependency.
129 */
130 nir_schedule_node *load_input;
131 nir_schedule_node *store_shared;
132 nir_schedule_node *unknown_intrinsic;
133 nir_schedule_node *discard;
134 nir_schedule_node *jump;
135
136 enum direction dir;
137 } nir_deps_state;
138
139 static void *
140 _mesa_hash_table_search_data(struct hash_table *ht, void *key)
141 {
142 struct hash_entry *entry = _mesa_hash_table_search(ht, key);
143 if (!entry)
144 return NULL;
145 return entry->data;
146 }
147
148 static nir_schedule_node *
149 nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
150 {
151 return _mesa_hash_table_search_data(instr_map, instr);
152 }
153
154 static struct set *
155 nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
156 {
157 if (src->is_ssa) {
158 return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
159 } else {
160 return _mesa_hash_table_search_data(scoreboard->remaining_uses,
161 src->reg.reg);
162 }
163 }
164
165 static int
166 nir_schedule_def_pressure(nir_ssa_def *def)
167 {
168 return def->num_components;
169 }
170
171 static int
172 nir_schedule_src_pressure(nir_src *src)
173 {
174 if (src->is_ssa)
175 return nir_schedule_def_pressure(src->ssa);
176 else
177 return src->reg.reg->num_components;
178 }
179
180 static int
181 nir_schedule_dest_pressure(nir_dest *dest)
182 {
183 if (dest->is_ssa)
184 return nir_schedule_def_pressure(&dest->ssa);
185 else
186 return dest->reg.reg->num_components;
187 }
188
189 /**
190 * Adds a dependency such that @after must appear in the final program after
191 * @before.
192 *
193 * We add @before as a child of @after, so that DAG heads are the outputs of
194 * the program and we make our scheduling decisions bottom to top.
195 */
196 static void
197 add_dep(nir_deps_state *state,
198 nir_schedule_node *before,
199 nir_schedule_node *after)
200 {
201 if (!before || !after)
202 return;
203
204 assert(before != after);
205
206 if (state->dir == F)
207 dag_add_edge(&before->dag, &after->dag, NULL);
208 else
209 dag_add_edge(&after->dag, &before->dag, NULL);
210 }
211
212
213 static void
214 add_read_dep(nir_deps_state *state,
215 nir_schedule_node *before,
216 nir_schedule_node *after)
217 {
218 add_dep(state, before, after);
219 }
220
221 static void
222 add_write_dep(nir_deps_state *state,
223 nir_schedule_node **before,
224 nir_schedule_node *after)
225 {
226 add_dep(state, *before, after);
227 *before = after;
228 }
229
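/* Makes each NIR register read depend on the most recent write of that
 * register seen in the current walk direction, so reads only get reordered
 * between the writes that surround them.
 */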
230 static bool
231 nir_schedule_reg_src_deps(nir_src *src, void *in_state)
232 {
233 nir_deps_state *state = in_state;
234
235 if (src->is_ssa)
236 return true;
237
238 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
239 src->reg.reg);
240 if (!entry)
241 return true;
242 nir_schedule_node *dst_n = entry->data;
243
244 nir_schedule_node *src_n =
245 nir_schedule_get_node(state->scoreboard->instr_map,
246 src->parent_instr);
247
248 add_dep(state, dst_n, src_n);
249
250 return true;
251 }
252
253 static bool
254 nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
255 {
256 nir_deps_state *state = in_state;
257
258 if (dest->is_ssa)
259 return true;
260
261 nir_schedule_node *dest_n =
262 nir_schedule_get_node(state->scoreboard->instr_map,
263 dest->reg.parent_instr);
264
265 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
266 dest->reg.reg);
267 if (!entry) {
268 _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
269 return true;
270 }
271 nir_schedule_node **before = (nir_schedule_node **)&entry->data;
272
273 add_write_dep(state, before, dest_n);
274
275 return true;
276 }
277
278 static bool
279 nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
280 {
281 nir_deps_state *state = in_state;
282 struct hash_table *instr_map = state->scoreboard->instr_map;
283 nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);
284
285 nir_foreach_use(src, def) {
286 nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
287 src->parent_instr);
288
289 add_read_dep(state, def_n, use_n);
290 }
291
292 return true;
293 }
294
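/* Adds ordering dependencies for intrinsics: memory-access and side-effecting
 * intrinsics are kept in order relative to the last instruction tracked for
 * their dependency class in nir_deps_state (inputs, shared memory, discards,
 * or the catch-all unknown_intrinsic).
 */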
295 static void
296 nir_schedule_intrinsic_deps(nir_deps_state *state,
297 nir_intrinsic_instr *instr)
298 {
299 nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
300 &instr->instr);
301
302 switch (instr->intrinsic) {
303 case nir_intrinsic_load_uniform:
304 case nir_intrinsic_load_ubo:
305 case nir_intrinsic_load_front_face:
306 break;
307
308 case nir_intrinsic_discard:
309 case nir_intrinsic_discard_if:
310 /* We are adding two dependencies:
311 *
312 * * An individual one that we could use to add a read_dep while handling
313 * nir_instr_type_tex
314 *
315 * * Include it on the unknown intrinsic set, as we want discard to be
316 * serialized in the same order relative to intervening stores or
317 * atomic accesses to SSBOs and images
318 */
319 add_write_dep(state, &state->discard, n);
320 add_write_dep(state, &state->unknown_intrinsic, n);
321 break;
322
323 case nir_intrinsic_store_output:
324 /* For some hardware and stages, output stores affect the same shared
325 * memory as input loads.
326 */
327 if ((state->scoreboard->options->stages_with_shared_io_memory &
328 (1 << state->scoreboard->shader->info.stage)))
329 add_write_dep(state, &state->load_input, n);
330
331 /* Make sure that preceding discards stay before the store_output */
332 add_read_dep(state, state->discard, n);
333
334 break;
335
336 case nir_intrinsic_load_input:
337 case nir_intrinsic_load_per_vertex_input:
338 add_read_dep(state, state->load_input, n);
339 break;
340
341 case nir_intrinsic_load_shared:
342 /* Don't move load_shared beyond a following store_shared, as the store
343 * could change the loaded value.
344 */
345 add_read_dep(state, state->store_shared, n);
346 break;
347
348 case nir_intrinsic_store_shared:
349 add_write_dep(state, &state->store_shared, n);
350 break;
351
352 case nir_intrinsic_control_barrier:
353 case nir_intrinsic_memory_barrier_shared:
354 add_write_dep(state, &state->store_shared, n);
355
356 /* Serialize against ssbos/atomics/etc. */
357 add_write_dep(state, &state->unknown_intrinsic, n);
358 break;
359
360 default:
361 /* Attempt to handle other intrinsics that we haven't individually
362 * categorized by serializing them in the same order relative to each
363 * other.
364 */
365 add_write_dep(state, &state->unknown_intrinsic, n);
366 break;
367 }
368 }
369
370 /**
371 * Common code for dependencies that need to be tracked both forward and
372 * backward.
373 *
374 * This is for things like "all reads of r4 have to happen between the r4
375 * writes that surround them".
376 */
377 static void
378 nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
379 {
380 nir_instr *instr = n->instr;
381
382 /* For NIR SSA defs, we only need to do a single pass of making the uses
383 * depend on the def.
384 */
385 if (state->dir == F)
386 nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);
387
388 /* For NIR regs, track the last writer in the scheduler state so that we
389 * can keep the writes in order and let reads get reordered only between
390 * each write.
391 */
392 nir_foreach_src(instr, nir_schedule_reg_src_deps, state);
393
394 nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);
395
396 /* Make sure any other instructions keep their positions relative to
397 * jumps.
398 */
399 if (instr->type != nir_instr_type_jump)
400 add_read_dep(state, state->jump, n);
401
402 switch (instr->type) {
403 case nir_instr_type_ssa_undef:
404 case nir_instr_type_load_const:
405 case nir_instr_type_alu:
406 case nir_instr_type_deref:
407 break;
408
409 case nir_instr_type_tex:
410 /* Don't move texture ops before a discard, as that could increase
411 * memory bandwidth for reading the discarded samples.
412 */
413 add_read_dep(state, state->discard, n);
414 break;
415
416 case nir_instr_type_jump:
417 add_write_dep(state, &state->jump, n);
418 break;
419
420 case nir_instr_type_call:
421 unreachable("Calls should have been lowered");
422 break;
423
424 case nir_instr_type_parallel_copy:
425 unreachable("Parallel copies should have been lowered");
426 break;
427
428 case nir_instr_type_phi:
429 unreachable("nir_schedule() should be called after lowering from SSA");
430 break;
431
432 case nir_instr_type_intrinsic:
433 nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
434 break;
435 }
436 }
437
438 static void
439 calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
440 {
441 nir_deps_state state = {
442 .scoreboard = scoreboard,
443 .dir = F,
444 .reg_map = _mesa_pointer_hash_table_create(NULL),
445 };
446
447 nir_foreach_instr(instr, block) {
448 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
449 instr);
450 nir_schedule_calculate_deps(&state, node);
451 }
452
453 ralloc_free(state.reg_map);
454 }
455
456 static void
457 calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
458 {
459 nir_deps_state state = {
460 .scoreboard = scoreboard,
461 .dir = R,
462 .reg_map = _mesa_pointer_hash_table_create(NULL),
463 };
464
465 nir_foreach_instr_reverse(instr, block) {
466 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
467 instr);
468 nir_schedule_calculate_deps(&state, node);
469 }
470
471 ralloc_free(state.reg_map);
472 }
473
474 typedef struct {
475 nir_schedule_scoreboard *scoreboard;
476 int regs_freed;
477 } nir_schedule_regs_freed_state;
478
479 static bool
480 nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
481 {
482 nir_schedule_regs_freed_state *state = in_state;
483 nir_schedule_scoreboard *scoreboard = state->scoreboard;
484 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
485
486 if (remaining_uses->entries == 1 &&
487 _mesa_set_search(remaining_uses, src->parent_instr)) {
488 state->regs_freed += nir_schedule_src_pressure(src);
489 }
490
491 return true;
492 }
493
494 static bool
495 nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
496 {
497 nir_schedule_regs_freed_state *state = in_state;
498
499 state->regs_freed -= nir_schedule_def_pressure(def);
500
501 return true;
502 }
503
504 static bool
505 nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
506 {
507 nir_schedule_regs_freed_state *state = in_state;
508 nir_schedule_scoreboard *scoreboard = state->scoreboard;
509
510 if (dest->is_ssa)
511 return true;
512
513 nir_register *reg = dest->reg.reg;
514
515 /* Only the first def of a reg counts against register pressure. */
516 if (!_mesa_set_search(scoreboard->live_values, reg))
517 state->regs_freed -= nir_schedule_dest_pressure(dest);
518
519 return true;
520 }
521
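/* Estimates the net change in register pressure from scheduling this
 * instruction now: channels freed by consuming the last remaining use of a
 * value, minus channels made live by the values it defines.
 */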
522 static int
523 nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
524 {
525 nir_schedule_regs_freed_state state = {
526 .scoreboard = scoreboard,
527 };
528
529 nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
530
531 nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
532
533 nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);
534
535 return state.regs_freed;
536 }
537
538 /**
539 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
540 * Scheduling for Parallelism) heuristic.
541 *
542 * Picks an instruction on the critical path that's ready to execute without
543 * stalls, if possible, otherwise picks the instruction on the critical path.
544 */
545 static nir_schedule_node *
546 nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
547 {
548 nir_schedule_node *chosen = NULL;
549
550 /* Find the leader in the ready (shouldn't-stall) set with the maximum
551 * cost.
552 */
553 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
554 if (scoreboard->time < n->ready_time)
555 continue;
556
557 if (!chosen || chosen->max_delay < n->max_delay)
558 chosen = n;
559 }
560 if (chosen) {
561 if (debug) {
562 fprintf(stderr, "chose (ready): ");
563 nir_print_instr(chosen->instr, stderr);
564 fprintf(stderr, "\n");
565 }
566
567 return chosen;
568 }
569
570 /* Otherwise, choose the leader with the maximum cost. */
571 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
572 if (!chosen || chosen->max_delay < n->max_delay)
573 chosen = n;
574 }
575 if (debug) {
576 fprintf(stderr, "chose (leader): ");
577 nir_print_instr(chosen->instr, stderr);
578 fprintf(stderr, "\n");
579 }
580
581 return chosen;
582 }
583
584 /**
585 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
586 * Scheduling for Register pressure) heuristic.
587 */
588 static nir_schedule_node *
589 nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
590 {
591 nir_schedule_node *chosen = NULL;
592
593 /* Find a ready inst with regs freed and pick the one with max cost. */
594 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
595 if (n->ready_time > scoreboard->time)
596 continue;
597
598 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
599
600 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
601 chosen = n;
602 }
603 }
604 if (chosen) {
605 if (debug) {
606 fprintf(stderr, "chose (freed+ready): ");
607 nir_print_instr(chosen->instr, stderr);
608 fprintf(stderr, "\n");
609 }
610
611 return chosen;
612 }
613
614 /* Find a leader with regs freed and pick the one with max cost. */
615 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
616 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
617
618 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
619 chosen = n;
620 }
621 }
622 if (chosen) {
623 if (debug) {
624 fprintf(stderr, "chose (regs freed): ");
625 nir_print_instr(chosen->instr, stderr);
626 fprintf(stderr, "\n");
627 }
628
629 return chosen;
630 }
631
632 /* Find a partially evaluated path and try to finish it off */
633 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
634 if (n->partially_evaluated_path &&
635 (!chosen || chosen->max_delay < n->max_delay)) {
636 chosen = n;
637 }
638 }
639 if (chosen) {
640 if (debug) {
641 fprintf(stderr, "chose (partial path): ");
642 nir_print_instr(chosen->instr, stderr);
643 fprintf(stderr, "\n");
644 }
645
646 return chosen;
647 }
648
649 /* Contra the paper, pick a leader with no effect on used regs. This may
650 * open up new opportunities, as otherwise a single-operand instr consuming
651 * a value will tend to keep us from ever freeing that value. This had a
652 * massive effect on reducing spilling on V3D.
653 *
654 * XXX: Should this prioritize ready?
655 */
656 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
657 if (nir_schedule_regs_freed(scoreboard, n) != 0)
658 continue;
659
660 if (!chosen || chosen->max_delay < n->max_delay)
661 chosen = n;
662 }
663 if (chosen) {
664 if (debug) {
665 fprintf(stderr, "chose (regs no-op): ");
666 nir_print_instr(chosen->instr, stderr);
667 fprintf(stderr, "\n");
668 }
669
670 return chosen;
671 }
672
673 /* Pick the max delay of the remaining ready set. */
674 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
675 if (n->ready_time > scoreboard->time)
676 continue;
677 if (!chosen || chosen->max_delay < n->max_delay)
678 chosen = n;
679 }
680 if (chosen) {
681 if (debug) {
682 fprintf(stderr, "chose (ready max delay): ");
683 nir_print_instr(chosen->instr, stderr);
684 fprintf(stderr, "\n");
685 }
686 return chosen;
687 }
688
689 /* Pick the max delay of the remaining leaders. */
690 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
691 if (!chosen || chosen->max_delay < n->max_delay)
692 chosen = n;
693 }
694
695 if (debug) {
696 fprintf(stderr, "chose (max delay): ");
697 nir_print_instr(chosen->instr, stderr);
698 fprintf(stderr, "\n");
699 }
700
701 return chosen;
702 }
703
704 static void
705 dump_state(nir_schedule_scoreboard *scoreboard)
706 {
707 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
708 fprintf(stderr, "maxdel %5d ", n->max_delay);
709 nir_print_instr(n->instr, stderr);
710 fprintf(stderr, "\n");
711
712 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
713 nir_schedule_node *child = (nir_schedule_node *)edge->child;
714
715 fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
716 nir_print_instr(child->instr, stderr);
717 fprintf(stderr, "\n");
718 }
719 }
720 }
721
722 static void
723 nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
724 void *reg_or_def,
725 nir_instr *reg_or_def_parent,
726 int pressure)
727 {
728 /* Make the value live if it's the first time it's been used. */
729 if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
730 _mesa_set_add(scoreboard->live_values, reg_or_def);
731 scoreboard->pressure += pressure;
732 }
733
734 /* Make the value dead if this was its last remaining use. Be careful not to
735 * decrement pressure twice when one instruction uses a value twice.
736 */
737 struct set *remaining_uses =
738 _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
739 struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
740 if (entry) {
741 _mesa_set_remove(remaining_uses, entry);
742
743 if (remaining_uses->entries == 0)
744 scoreboard->pressure -= pressure;
745 }
746 }
747
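/* Updates liveness for one source of a just-scheduled instruction, and marks
 * the other users of the same SSA value as being on a partially evaluated
 * path so the CSR heuristic will try to finish consuming that value.
 */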
748 static bool
749 nir_schedule_mark_src_scheduled(nir_src *src, void *state)
750 {
751 nir_schedule_scoreboard *scoreboard = state;
752 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
753
754 struct set_entry *entry = _mesa_set_search(remaining_uses,
755 src->parent_instr);
756 if (entry) {
757 /* Once we've used an SSA value in one instruction, bump the priority of
758 * the other uses so the SSA value can get fully consumed.
759 *
760 * We don't do this for registers, as it would be a hassle and it's
761 * unclear whether it would help. Also, skip it for constants, as
762 * they're often folded as immediates into backend instructions and have
763 * many unrelated instructions all referencing the same value (0).
764 */
765 if (src->is_ssa &&
766 src->ssa->parent_instr->type != nir_instr_type_load_const) {
767 nir_foreach_use(other_src, src->ssa) {
768 if (other_src->parent_instr == src->parent_instr)
769 continue;
770
771 nir_schedule_node *n =
772 nir_schedule_get_node(scoreboard->instr_map,
773 other_src->parent_instr);
774
775 if (n && !n->partially_evaluated_path) {
776 if (debug) {
777 fprintf(stderr, " New partially evaluated path: ");
778 nir_print_instr(n->instr, stderr);
779 fprintf(stderr, "\n");
780 }
781
782 n->partially_evaluated_path = true;
783 }
784 }
785 }
786 }
787
788 nir_schedule_mark_use(scoreboard,
789 src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
790 src->parent_instr,
791 nir_schedule_src_pressure(src));
792
793 return true;
794 }
795
796 static bool
797 nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
798 {
799 nir_schedule_scoreboard *scoreboard = state;
800
801 nir_schedule_mark_use(scoreboard, def, def->parent_instr,
802 nir_schedule_def_pressure(def));
803
804 return true;
805 }
806
807 static bool
808 nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
809 {
810 nir_schedule_scoreboard *scoreboard = state;
811
812 /* SSA defs were handled in nir_schedule_mark_def_scheduled()
813 */
814 if (dest->is_ssa)
815 return true;
816
817 /* XXX: This is not actually accurate for regs -- the last use of a reg may
818 * have a live interval that extends across control flow. We should
819 * calculate the live ranges of regs, and have scheduler nodes for the CF
820 * nodes that also "use" the reg.
821 */
822 nir_schedule_mark_use(scoreboard, dest->reg.reg,
823 dest->reg.parent_instr,
824 nir_schedule_dest_pressure(dest));
825
826 return true;
827 }
828
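/* Accounts for a chosen instruction in the scoreboard: updates pressure and
 * liveness for its sources and defs, propagates ready times to its children,
 * prunes it from the DAG, and advances the scoreboard's time.
 */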
829 static void
830 nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
831 nir_schedule_node *n)
832 {
833 nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
834 nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
835 nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);
836
837 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
838 nir_schedule_node *child = (nir_schedule_node *)edge->child;
839
840 child->ready_time = MAX2(child->ready_time,
841 scoreboard->time + n->delay);
842
843 if (child->dag.parent_count == 1) {
844 if (debug) {
845 fprintf(stderr, " New DAG head: ");
846 nir_print_instr(child->instr, stderr);
847 fprintf(stderr, "\n");
848 }
849 }
850 }
851
852 dag_prune_head(scoreboard->dag, &n->dag);
853
854 scoreboard->time = MAX2(n->ready_time, scoreboard->time);
855 scoreboard->time++;
856 }
857
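/* Main scheduling loop for a block: while DAG heads remain, picks one with
 * the CSP heuristic when pressure is below the backend's threshold and with
 * the CSR heuristic otherwise, then moves the chosen instruction to the end
 * of the block's instruction list.
 */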
858 static void
859 nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
860 {
861 while (!list_is_empty(&scoreboard->dag->heads)) {
862 if (debug) {
863 fprintf(stderr, "current list:\n");
864 dump_state(scoreboard);
865 }
866
867 nir_schedule_node *chosen;
868 if (scoreboard->pressure < scoreboard->options->threshold)
869 chosen = nir_schedule_choose_instruction_csp(scoreboard);
870 else
871 chosen = nir_schedule_choose_instruction_csr(scoreboard);
872
873 /* Now that we've scheduled a new instruction, some of its children may
874 * be promoted to the list of instructions ready to be scheduled.
875 */
876 nir_schedule_mark_node_scheduled(scoreboard, chosen);
877
878 /* Move the instruction to the end (so our first chosen instructions are
879 * the start of the program).
880 */
881 exec_node_remove(&chosen->instr->node);
882 exec_list_push_tail(&block->instr_list, &chosen->instr->node);
883
884 if (debug)
885 fprintf(stderr, "\n");
886 }
887 }
888
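/* Rough per-instruction latency estimate used to stretch out the live ranges
 * of long-latency operations; only texture ops are currently given a large
 * delay.
 */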
889 static uint32_t
890 nir_schedule_get_delay(nir_instr *instr)
891 {
892 switch (instr->type) {
893 case nir_instr_type_ssa_undef:
894 case nir_instr_type_load_const:
895 case nir_instr_type_alu:
896 case nir_instr_type_deref:
897 case nir_instr_type_jump:
898 case nir_instr_type_parallel_copy:
899 case nir_instr_type_call:
900 case nir_instr_type_phi:
901 return 1;
902
903 case nir_instr_type_intrinsic:
904 /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
905 return 1;
906
907 case nir_instr_type_tex:
908 /* Pick some large number to try to fetch textures early and sample them
909 * late.
910 */
911 return 100;
912 }
913
914 return 0;
915 }
916
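/* Bottom-up DAG callback that computes each node's max_delay as the cost of
 * the maximum-delay path from that node to the leaves.
 */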
917 static void
918 nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
919 {
920 nir_schedule_node *n = (nir_schedule_node *)node;
921 uint32_t max_delay = 0;
922
923 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
924 nir_schedule_node *child = (nir_schedule_node *)edge->child;
925 max_delay = MAX2(child->max_delay, max_delay);
926 }
927
928 n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
929 }
930
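/* Schedules a single basic block: builds a DAG node per instruction,
 * calculates forward and reverse dependencies, computes max_delay bottom-up,
 * and then re-emits the instructions in the chosen order.
 */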
931 static void
932 nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
933 {
934 void *mem_ctx = ralloc_context(NULL);
935 scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);
936
937 scoreboard->dag = dag_create(mem_ctx);
938
939 nir_foreach_instr(instr, block) {
940 nir_schedule_node *n =
941 rzalloc(mem_ctx, nir_schedule_node);
942
943 n->instr = instr;
944 n->delay = nir_schedule_get_delay(instr);
945 dag_init_node(scoreboard->dag, &n->dag);
946
947 _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
948 }
949
950 calculate_forward_deps(scoreboard, block);
951 calculate_reverse_deps(scoreboard, block);
952
953 dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);
954
955 nir_schedule_instructions(scoreboard, block);
956
957 ralloc_free(mem_ctx);
958 scoreboard->instr_map = NULL;
959 }
960
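/* Seeds the remaining_uses set for an SSA def with its defining instruction
 * and all of its users.
 */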
961 static bool
962 nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
963 {
964 nir_schedule_scoreboard *scoreboard = state;
965 struct set *def_uses = _mesa_pointer_set_create(scoreboard);
966
967 _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);
968
969 _mesa_set_add(def_uses, def->parent_instr);
970
971 nir_foreach_use(src, def) {
972 _mesa_set_add(def_uses, src->parent_instr);
973 }
974
975 /* XXX: Handle if uses */
976
977 return true;
978 }
979
980 static nir_schedule_scoreboard *
981 nir_schedule_get_scoreboard(nir_shader *shader,
982 const nir_schedule_options *options)
983 {
984 nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);
985
986 scoreboard->shader = shader;
987 scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
988 scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
989 scoreboard->options = options;
990 scoreboard->pressure = 0;
991
992 nir_foreach_function(function, shader) {
993 nir_foreach_register(reg, &function->impl->registers) {
994 struct set *register_uses =
995 _mesa_pointer_set_create(scoreboard);
996
997 _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);
998
999 nir_foreach_use(src, reg) {
1000 _mesa_set_add(register_uses, src->parent_instr);
1001 }
1002
1003 /* XXX: Handle if uses */
1004
1005 nir_foreach_def(dest, reg) {
1006 _mesa_set_add(register_uses, dest->reg.parent_instr);
1007 }
1008 }
1009
1010 nir_foreach_block(block, function->impl) {
1011 nir_foreach_instr(instr, block) {
1012 nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
1013 scoreboard);
1014 }
1015
1016 /* XXX: We're ignoring if uses, which may prioritize scheduling other
1017 * uses of the if src even when it doesn't help. That's not many
1018 * values, though, so meh.
1019 */
1020 }
1021 }
1022
1023 return scoreboard;
1024 }
1025
1026 static void
1027 nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
1028 {
1029 #ifdef NDEBUG
1030 return;
1031 #endif
1032
1033 bool any_uses = false;
1034
1035 hash_table_foreach(scoreboard->remaining_uses, entry) {
1036 struct set *remaining_uses = entry->data;
1037
1038 set_foreach(remaining_uses, instr_entry) {
1039 if (!any_uses) {
1040 fprintf(stderr, "Tracked uses remain after scheduling. "
1041 "Affected instructions: \n");
1042 any_uses = true;
1043 }
1044 nir_print_instr(instr_entry->key, stderr);
1045 fprintf(stderr, "\n");
1046 }
1047 }
1048
1049 assert(!any_uses);
1050 }
1051
1052 /**
1053 * Schedules the NIR instructions to try to decrease stalls (for example,
1054 * delaying texture reads) while managing register pressure.
1055 *
1056 * The threshold represents "number of NIR register/SSA def channels live
1057 * before switching the scheduling heuristic to reduce register pressure",
1058 * since most of our GPU architectures are scalar (extending to vector with a
1059 * flag wouldn't be hard). This number should be a bit below the number of
1060 * registers available (counting any that may be occupied by system value
1061 * payload values, for example), since the heuristic may not always be able to
1062 * free a register immediately. The amount below the limit is up to you to
1063 * tune.
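 *
 * A hypothetical invocation from a backend might look like the following
 * (the threshold and stage mask are made-up illustrative values, not
 * recommendations):
 *
 *    nir_schedule_options options = {
 *       .threshold = 64,
 *       .stages_with_shared_io_memory = (1 << MESA_SHADER_GEOMETRY),
 *    };
 *    nir_schedule(shader, &options);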
1064 */
1065 void
1066 nir_schedule(nir_shader *shader,
1067 const nir_schedule_options *options)
1068 {
1069 nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
1070 options);
1071
1072 if (debug) {
1073 fprintf(stderr, "NIR shader before scheduling:\n");
1074 nir_print_shader(shader, stderr);
1075 }
1076
1077 nir_foreach_function(function, shader) {
1078 if (!function->impl)
1079 continue;
1080
1081 nir_foreach_block(block, function->impl) {
1082 nir_schedule_block(scoreboard, block);
1083 }
1084 }
1085
1086 nir_schedule_validate_uses(scoreboard);
1087
1088 ralloc_free(scoreboard);
1089 }