/*
 * Copyright © 2019 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"

/* Gates the verbose scheduling output to stderr used throughout this pass. */
static bool debug;
/**
 * Implements basic-block-level prepass instruction scheduling in NIR to
 * manage register pressure.
 *
 * This is based on the Goodman/Hsu paper (1988, cached copy at
 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
 * make up the DDG for NIR (which can be mostly done using the NIR def/use
 * chains for SSA instructions, plus some edges for ordering register writes
 * vs reads, and some more for ordering intrinsics). Then we pick heads off
 * of the DDG using their heuristic to emit the NIR instructions back into the
 * block in their new order.
 *
 * The hard case for prepass scheduling on GPUs seems to always be consuming
 * texture/ubo results. The register pressure heuristic doesn't want to pick
 * an instr that starts consuming texture results because it usually won't be
 * the only usage, so that instruction will increase pressure.
 *
 * If you try to force consumption of tex results always, then in a case where
 * a single sample is used for many outputs, you'll end up picking every other
 * user and expanding register pressure. The partially_evaluated_path flag
 * helps tremendously, in that if you happen for whatever reason to pick a
 * texture sample's output, then you'll try to finish off that sample. Future
 * work may include doing some local search before locking in a choice, to try
 * to more reliably find the case where just a few choices going against the
 * heuristic can manage to free the whole vector.
 */
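/* As a rough illustration of the tex-consumption problem described above
 * (hypothetical NIR-like pseudocode, not tied to any real shader):
 *
 *    vec4 s = txl ...       <- 4 channels become live
 *    out0 = fmul s.x, a
 *    out1 = fmul s.y, b
 *    out2 = fmul s.z, c
 *    out3 = fmul s.w, d
 *
 * No single fmul frees s, so the pressure heuristic avoids all of them; but
 * once one is picked, partially_evaluated_path biases the scheduler toward
 * the other three so the whole vector can die.
 */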
/**
 * Represents a node in the DDG for a NIR instruction.
 */
typedef struct {
   struct dag_node dag; /* must be first for our u_dynarray_foreach */
   nir_instr *instr;
   bool partially_evaluated_path;

   /* Approximate estimate of the delay between starting this instruction and
    * its results being available.
    *
    * Accuracy is not too important, given that we're prepass scheduling here
    * and just trying to reduce excess dependencies introduced by a register
    * allocator by stretching out the live intervals of expensive results.
    */
   uint32_t delay;

   /* Cost of the maximum-delay path from this node to the leaves. */
   uint32_t max_delay;

   /* scoreboard->time value when this instruction can be scheduled without
    * any stalls expected.
    */
   uint32_t ready_time;
} nir_schedule_node;
typedef struct {
   nir_shader *shader;

   /* Mapping from nir_register * or nir_ssa_def * to a struct set of
    * instructions remaining to be scheduled using the register.
    */
   struct hash_table *remaining_uses;

   /* Map from nir_instr to nir_schedule_node * */
   struct hash_table *instr_map;

   /* Set of nir_register * or nir_ssa_def * that have had any instruction
    * scheduled on them.
    */
   struct set *live_values;

   /* An abstract approximation of the number of nir_scheduler_node->delay
    * units since the start of the shader.
    */
   uint32_t time;

   /* Number of channels currently used by the NIR instructions that have been
    * scheduled.
    */
   int pressure;

   /* Number of channels that may be in use before we switch to the
    * pressure-prioritizing scheduling heuristic.
    */
   int threshold;

   /* Mask of stages that share memory for inputs and outputs */
   unsigned stages_with_shared_io_memory;

   /* DDG for the block currently being scheduled. */
   struct dag *dag;
} nir_schedule_scoreboard;
/* When walking the instructions in reverse, we use this flag to swap
 * before/after in add_dep().
 */
enum direction { F, R };

typedef struct {
   nir_schedule_scoreboard *scoreboard;

   /* Map from nir_register to nir_schedule_node * */
   struct hash_table *reg_map;

   /* Scheduler nodes for last instruction involved in some class of
    * dependency.
    */
   nir_schedule_node *load_input;
   nir_schedule_node *store_shared;
   nir_schedule_node *unknown_intrinsic;
   nir_schedule_node *discard;
   nir_schedule_node *jump;

   enum direction dir;
} nir_deps_state;
static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
   if (!entry)
      return NULL;
   return entry->data;
}
static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
   return _mesa_hash_table_search_data(instr_map, instr);
}
static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
   if (src->is_ssa) {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
   } else {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses,
                                          src->reg.reg);
   }
}
static int
nir_schedule_def_pressure(nir_ssa_def *def)
{
   return def->num_components;
}
static int
nir_schedule_src_pressure(nir_src *src)
{
   if (src->is_ssa)
      return nir_schedule_def_pressure(src->ssa);
   else
      return src->reg.reg->num_components;
}
static int
nir_schedule_dest_pressure(nir_dest *dest)
{
   if (dest->is_ssa)
      return nir_schedule_def_pressure(&dest->ssa);
   else
      return dest->reg.reg->num_components;
}
/**
 * Adds a dependency such that @after must appear in the final program after
 * @before.
 *
 * We add @before as a child of @after, so that DAG heads are the outputs of
 * the program and we make our scheduling decisions bottom to top.
 */
static void
add_dep(nir_deps_state *state,
        nir_schedule_node *before,
        nir_schedule_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->dir == F)
      dag_add_edge(&before->dag, &after->dag, NULL);
   else
      dag_add_edge(&after->dag, &before->dag, NULL);
}
static void
add_read_dep(nir_deps_state *state,
             nir_schedule_node *before,
             nir_schedule_node *after)
{
   add_dep(state, before, after);
}
static void
add_write_dep(nir_deps_state *state,
              nir_schedule_node **before,
              nir_schedule_node *after)
{
   add_dep(state, *before, after);
   *before = after;
}
static bool
nir_schedule_reg_src_deps(nir_src *src, void *in_state)
{
   nir_deps_state *state = in_state;

   if (src->is_ssa)
      return true;

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      src->reg.reg);
   if (!entry)
      return true;

   nir_schedule_node *dst_n = entry->data;

   nir_schedule_node *src_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            src->parent_instr);

   add_dep(state, dst_n, src_n);

   return true;
}
static bool
nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
{
   nir_deps_state *state = in_state;

   if (dest->is_ssa)
      return true;

   nir_schedule_node *dest_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            dest->reg.parent_instr);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      dest->reg.reg);
   if (!entry) {
      _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
      return true;
   }

   nir_schedule_node **before = (nir_schedule_node **)&entry->data;

   add_write_dep(state, before, dest_n);

   return true;
}
static bool
nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
{
   nir_deps_state *state = in_state;
   struct hash_table *instr_map = state->scoreboard->instr_map;
   nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);

   nir_foreach_use(src, def) {
      nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
                                                       src->parent_instr);

      add_read_dep(state, def_n, use_n);
   }

   return true;
}
static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
                            nir_intrinsic_instr *instr)
{
   nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
                                                &instr->instr);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_front_face:
      break;

   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
      /* We are adding two dependencies:
       *
       * * An individual one that we could use to add a read_dep while handling
       *   nir_instr_type_tex
       *
       * * Include it on the unknown intrinsic set, as we want discard to be
       *   serialized in the same order relative to intervening stores or
       *   atomic accesses to SSBOs and images
       */
      add_write_dep(state, &state->discard, n);
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_store_output:
      /* For some hardware and stages, output stores affect the same shared
       * memory as input loads.
       */
      if ((state->scoreboard->stages_with_shared_io_memory &
           (1 << state->scoreboard->shader->info.stage)))
         add_write_dep(state, &state->load_input, n);

      /* Make sure that preceding discards stay before the store_output */
      add_read_dep(state, state->discard, n);
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      add_read_dep(state, state->load_input, n);
      break;

   case nir_intrinsic_load_shared:
      /* Don't move load_shared beyond a following store_shared, as it could
       * change its value.
       */
      add_read_dep(state, state->store_shared, n);
      break;

   case nir_intrinsic_store_shared:
      add_write_dep(state, &state->store_shared, n);
      break;

   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier_shared:
      add_write_dep(state, &state->store_shared, n);

      /* Serialize against ssbos/atomics/etc. */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   default:
      /* Attempt to handle other intrinsics that we haven't individually
       * categorized by serializing them in the same order relative to each
       * other.
       */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;
   }
}
/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
   nir_instr *instr = n->instr;

   /* For NIR SSA defs, we only need to do a single pass of making the uses
    * depend on the def.
    */
   if (state->dir == F)
      nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);

   /* For NIR regs, track the last writer in the scheduler state so that we
    * can keep the writes in order and let reads get reordered only between
    * each write.
    */
   nir_foreach_src(instr, nir_schedule_reg_src_deps, state);
   nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);

   /* Make sure any other instructions keep their positions relative to
    * jumps.
    */
   if (instr->type != nir_instr_type_jump)
      add_read_dep(state, state->jump, n);

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
      break;

   case nir_instr_type_tex:
      /* Don't move texture ops before a discard, as that could increase
       * memory bandwidth for reading the discarded samples.
       */
      add_read_dep(state, state->discard, n);
      break;

   case nir_instr_type_jump:
      add_write_dep(state, &state->jump, n);
      break;

   case nir_instr_type_call:
      unreachable("Calls should have been lowered");
      break;

   case nir_instr_type_parallel_copy:
      unreachable("Parallel copies should have been lowered");
      break;

   case nir_instr_type_phi:
      unreachable("nir_schedule() should be called after lowering from SSA");
      break;

   case nir_instr_type_intrinsic:
      nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
      break;
   }
}
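/* A sketch of what the two passes buy us (hypothetical NIR-like snippet,
 * not from any real shader):
 *
 *    r4 = fmul ...        (write A)
 *    x  = fadd r4, y      (read)
 *    r4 = ffma ...        (write B)
 *
 * The forward pass orders the read after write A, and the reverse pass
 * orders write B after the read, so the read stays pinned between the two
 * writes while remaining free to reorder against other reads of r4.
 */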
static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = F,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}
static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = R,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr_reverse(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}
typedef struct {
   nir_schedule_scoreboard *scoreboard;
   int regs_freed;
} nir_schedule_regs_freed_state;
static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, src->parent_instr)) {
      state->regs_freed += nir_schedule_src_pressure(src);
   }

   return true;
}
static bool
nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;

   state->regs_freed -= nir_schedule_def_pressure(def);

   return true;
}
static bool
nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;

   if (dest->is_ssa)
      return true;

   nir_register *reg = dest->reg.reg;

   /* Only the first def of a reg counts against register pressure. */
   if (!_mesa_set_search(scoreboard->live_values, reg))
      state->regs_freed -= nir_schedule_dest_pressure(dest);

   return true;
}
static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
   nir_schedule_regs_freed_state state = {
      .scoreboard = scoreboard,
   };

   nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
   nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
   nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);

   return state.regs_freed;
}
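/* A worked example of the accounting above (hypothetical instruction): if n
 * is "c = fadd a, b", this is the last remaining use of both a and b (one
 * channel each), and c is a new single-channel SSA def, then regs_freed =
 * 1 + 1 - 1 = 1, so scheduling n would reduce pressure by one channel.
 */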
/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
 * Scheduling for Parallelism) heuristic.
 *
 * Picks an instruction on the critical path that's ready to execute without
 * stalls, if possible, otherwise picks the instruction on the critical path.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the maximum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the maximum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (debug) {
      fprintf(stderr, "chose (leader): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}
/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find a ready inst with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (freed+ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a leader with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs freed): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a partially evaluated path and try to finish it off */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->partially_evaluated_path &&
          (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }

   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (partial path): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Contra the paper, pick a leader with no effect on used regs. This may
    * open up new opportunities, as otherwise a single-operand instr consuming
    * a value will tend to block finding a way to free that value. This had a
    * massive effect on reducing spilling on V3D.
    *
    * XXX: Should this prioritize ready?
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (nir_schedule_regs_freed(scoreboard, n) != 0)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs no-op): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining ready set. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready max delay): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining leaders. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (debug) {
      fprintf(stderr, "chose (max delay): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}
static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      fprintf(stderr, "maxdel %5d ", n->max_delay);
      nir_print_instr(n->instr, stderr);
      fprintf(stderr, "\n");

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         nir_schedule_node *child = (nir_schedule_node *)edge->child;

         fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
         nir_print_instr(child->instr, stderr);
         fprintf(stderr, "\n");
      }
   }
}
static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
                      void *reg_or_def,
                      nir_instr *reg_or_def_parent,
                      int pressure)
{
   /* Make the value live if it's the first time it's been used. */
   if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
      _mesa_set_add(scoreboard->live_values, reg_or_def);
      scoreboard->pressure += pressure;
   }

   /* Make the value dead if it's the last remaining use. Be careful when one
    * instruction uses a value twice to not decrement pressure twice.
    */
   struct set *remaining_uses =
      _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
   struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
   if (entry) {
      _mesa_set_remove(remaining_uses, entry);

      if (remaining_uses->entries == 0)
         scoreboard->pressure -= pressure;
   }
}
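/* Concretely (hypothetical values): the first scheduled use of a 4-channel
 * value bumps pressure by 4 and marks it live; each later use removes one
 * entry from remaining_uses; the use that empties the set drops pressure by
 * 4 again. An instruction reading the value twice only has a single entry in
 * the set, so pressure is never decremented twice on its behalf.
 */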
static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   struct set_entry *entry = _mesa_set_search(remaining_uses,
                                              src->parent_instr);
   if (entry) {
      /* Once we've used an SSA value in one instruction, bump the priority of
       * the other uses so the SSA value can get fully consumed.
       *
       * We don't do this for registers, as it would be a hassle and it's
       * unclear if that would help or not. Also, skip it for constants, as
       * they're often folded as immediates into backend instructions and have
       * many unrelated instructions all referencing the same value (0).
       */
      if (src->is_ssa &&
          src->ssa->parent_instr->type != nir_instr_type_load_const) {
         nir_foreach_use(other_src, src->ssa) {
            if (other_src->parent_instr == src->parent_instr)
               continue;

            nir_schedule_node *n =
               nir_schedule_get_node(scoreboard->instr_map,
                                     other_src->parent_instr);

            if (n && !n->partially_evaluated_path) {
               if (debug) {
                  fprintf(stderr, "  New partially evaluated path: ");
                  nir_print_instr(n->instr, stderr);
                  fprintf(stderr, "\n");
               }

               n->partially_evaluated_path = true;
            }
         }
      }
   }

   nir_schedule_mark_use(scoreboard,
                         src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
                         src->parent_instr,
                         nir_schedule_src_pressure(src));

   return true;
}
static bool
nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   nir_schedule_mark_use(scoreboard, def, def->parent_instr,
                         nir_schedule_def_pressure(def));

   return true;
}
static bool
nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   /* SSA defs were handled in nir_schedule_mark_def_scheduled() */
   if (dest->is_ssa)
      return true;

   /* XXX: This is not actually accurate for regs -- the last use of a reg may
    * have a live interval that extends across control flow. We should
    * calculate the live ranges of regs, and have scheduler nodes for the CF
    * nodes that also "use" the reg.
    */
   nir_schedule_mark_use(scoreboard, dest->reg.reg,
                         dest->reg.parent_instr,
                         nir_schedule_dest_pressure(dest));

   return true;
}
static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
                                 nir_schedule_node *n)
{
   nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
   nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
   nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;

      child->ready_time = MAX2(child->ready_time,
                               scoreboard->time + n->delay);

      if (child->dag.parent_count == 1) {
         if (debug) {
            fprintf(stderr, "  New DAG head: ");
            nir_print_instr(child->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
   }

   dag_prune_head(scoreboard->dag, &n->dag);

   scoreboard->time = MAX2(n->ready_time, scoreboard->time);
   scoreboard->time++;
}
static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   while (!list_is_empty(&scoreboard->dag->heads)) {
      if (debug) {
         fprintf(stderr, "current list:\n");
         dump_state(scoreboard);
      }

      nir_schedule_node *chosen;
      if (scoreboard->pressure < scoreboard->threshold)
         chosen = nir_schedule_choose_instruction_csp(scoreboard);
      else
         chosen = nir_schedule_choose_instruction_csr(scoreboard);

      /* Now that we've scheduled a new instruction, some of its children may
       * be promoted to the list of instructions ready to be scheduled.
       */
      nir_schedule_mark_node_scheduled(scoreboard, chosen);

      /* Move the instruction to the end (so our first chosen instructions are
       * the start of the program).
       */
      exec_node_remove(&chosen->instr->node);
      exec_list_push_tail(&block->instr_list, &chosen->instr->node);

      if (debug)
         fprintf(stderr, "\n");
   }
}
static uint32_t
nir_schedule_get_delay(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_jump:
   case nir_instr_type_parallel_copy:
   case nir_instr_type_call:
   case nir_instr_type_phi:
      return 1;

   case nir_instr_type_intrinsic:
      /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
      return 1;

   case nir_instr_type_tex:
      /* Pick some large number to try to fetch textures early and sample them
       * late.
       */
      return 100;
   }

   return 0;
}
static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
   nir_schedule_node *n = (nir_schedule_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}
static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   void *mem_ctx = ralloc_context(NULL);
   scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);

   scoreboard->dag = dag_create(mem_ctx);

   nir_foreach_instr(instr, block) {
      nir_schedule_node *n =
         rzalloc(mem_ctx, nir_schedule_node);

      n->instr = instr;
      n->delay = nir_schedule_get_delay(instr);
      dag_init_node(scoreboard->dag, &n->dag);

      _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
   }

   calculate_forward_deps(scoreboard, block);
   calculate_reverse_deps(scoreboard, block);

   dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);

   nir_schedule_instructions(scoreboard, block);

   ralloc_free(mem_ctx);
   scoreboard->instr_map = NULL;
}
static bool
nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *def_uses = _mesa_pointer_set_create(scoreboard);

   _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);

   _mesa_set_add(def_uses, def->parent_instr);

   nir_foreach_use(src, def) {
      _mesa_set_add(def_uses, src->parent_instr);
   }

   /* XXX: Handle if uses */

   return true;
}
static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
                            const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);

   scoreboard->shader = shader;
   scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
   scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
   scoreboard->threshold = options->threshold;
   scoreboard->stages_with_shared_io_memory =
      options->stages_with_shared_io_memory;
   scoreboard->pressure = 0;

   nir_foreach_function(function, shader) {
      nir_foreach_register(reg, &function->impl->registers) {
         struct set *register_uses =
            _mesa_pointer_set_create(scoreboard);

         _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);

         nir_foreach_use(src, reg) {
            _mesa_set_add(register_uses, src->parent_instr);
         }

         /* XXX: Handle if uses */

         nir_foreach_def(dest, reg) {
            _mesa_set_add(register_uses, dest->reg.parent_instr);
         }
      }

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
                                scoreboard);
         }
      }

      /* XXX: We're ignoring if uses, which may prioritize scheduling other
       * uses of the if src even when it doesn't help. That's not many
       * values, though, so meh.
       */
   }

   return scoreboard;
}
static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
   return;
#endif

   bool any_uses = false;

   hash_table_foreach(scoreboard->remaining_uses, entry) {
      struct set *remaining_uses = entry->data;

      set_foreach(remaining_uses, instr_entry) {
         if (!any_uses) {
            fprintf(stderr, "Tracked uses remain after scheduling. "
                    "Affected instructions: \n");
            any_uses = true;
         }
         nir_print_instr(instr_entry->key, stderr);
         fprintf(stderr, "\n");
      }
   }

   assert(!any_uses);
}
/**
 * Schedules the NIR instructions to try to decrease stalls (for example,
 * delaying texture reads) while managing register pressure.
 *
 * The threshold represents "number of NIR register/SSA def channels live
 * before switching the scheduling heuristic to reduce register pressure",
 * since most of our GPU architectures are scalar (extending to vector with a
 * flag wouldn't be hard). This number should be a bit below the number of
 * registers available (counting any that may be occupied by system value
 * payload values, for example), since the heuristic may not always be able to
 * free a register immediately. The amount below the limit is up to you to
 * tune.
 */
void
nir_schedule(nir_shader *shader,
             const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
                                                                     options);

   if (debug) {
      fprintf(stderr, "NIR shader before scheduling:\n");
      nir_print_shader(shader, stderr);
   }

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_schedule_block(scoreboard, block);
      }
   }

   nir_schedule_validate_uses(scoreboard);

   ralloc_free(scoreboard);
}
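/* A minimal usage sketch (hypothetical helper and values; a real backend
 * derives the threshold from its register file size and tunes the slack):
 *
 *    void my_backend_schedule(nir_shader *s)
 *    {
 *       const nir_schedule_options schedule_options = {
 *          .threshold = 56,
 *          .stages_with_shared_io_memory = (1 << MESA_SHADER_VERTEX),
 *       };
 *
 *       nir_schedule(s, &schedule_options);
 *    }
 */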