src/compiler/nir/nir_schedule.c
1 /*
2 * Copyright © 2019 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "nir_schedule.h"
25 #include "util/dag.h"
26 #include "util/u_dynarray.h"
27
28 /** @file
29 *
30 * Implements basic-block-level prepass instruction scheduling in NIR to
31 * manage register pressure.
32 *
33 * This is based on the Goodman/Hsu paper (1988, cached copy at
34 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
35 * make up the DDG for NIR (which can be mostly done using the NIR def/use
36 * chains for SSA instructions, plus some edges for ordering register writes
37 * vs reads, and some more for ordering intrinsics). Then we pick heads off
38 * of the DDG using their heuristic to emit the NIR instructions back into the
39 * block in their new order.
40 *
41 * The hard case for prepass scheduling on GPUs seems to always be consuming
42 * texture/ubo results. The register pressure heuristic doesn't want to pick
43 * an instr that starts consuming texture results because it usually won't be
44 * the only usage, so that instruction will increase pressure.
45 *
46  * If you try to always force consumption of tex results, then in a case where
47  * a single sample is used for many outputs, you'll end up picking every other
48 * user and expanding register pressure. The partially_evaluated_path flag
49 * helps tremendously, in that if you happen for whatever reason to pick a
50 * texture sample's output, then you'll try to finish off that sample. Future
51 * work may include doing some local search before locking in a choice, to try
52 * to more reliably find the case where just a few choices going against the
53 * heuristic can manage to free the whole vector.
54 */
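
/* A minimal usage sketch from a backend's point of view (illustrative only:
 * the option values are hypothetical and would need per-driver tuning, and
 * only the option fields referenced in this file are shown):
 *
 *    const nir_schedule_options schedule_options = {
 *       .threshold = 64,
 *       .stages_with_shared_io_memory = (1 << MESA_SHADER_VERTEX),
 *       .fallback = false,
 *    };
 *    nir_schedule(shader, &schedule_options);
 */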
55
56 static bool debug;
57
58 /**
59 * Represents a node in the DDG for a NIR instruction.
60 */
61 typedef struct {
62 struct dag_node dag; /* must be first for our u_dynarray_foreach */
63 nir_instr *instr;
64 bool partially_evaluated_path;
65
66 /* Approximate estimate of the delay between starting this instruction and
67 * its results being available.
68 *
69 * Accuracy is not too important, given that we're prepass scheduling here
70 * and just trying to reduce excess dependencies introduced by a register
71 * allocator by stretching out the live intervals of expensive
72 * instructions.
73 */
74 uint32_t delay;
75
76 /* Cost of the maximum-delay path from this node to the leaves. */
77 uint32_t max_delay;
78
79 /* scoreboard->time value when this instruction can be scheduled without
80 * any stalls expected.
81 */
82 uint32_t ready_time;
83 } nir_schedule_node;
84
85 typedef struct {
86 struct dag *dag;
87
88 nir_shader *shader;
89
90 /* Mapping from nir_register * or nir_ssa_def * to a struct set of
91 * instructions remaining to be scheduled using the register.
92 */
93 struct hash_table *remaining_uses;
94
95 /* Map from nir_instr to nir_schedule_node * */
96 struct hash_table *instr_map;
97
98 /* Set of nir_register * or nir_ssa_def * that have had any instruction
99 * scheduled on them.
100 */
101 struct set *live_values;
102
103    /* An abstract approximation of the number of nir_schedule_node->delay
104 * units since the start of the shader.
105 */
106 uint32_t time;
107
108 /* Number of channels currently used by the NIR instructions that have been
109 * scheduled.
110 */
111 int pressure;
112
113 /* Options specified by the backend */
114 const nir_schedule_options *options;
115 } nir_schedule_scoreboard;
116
117 /* When walking the instructions in reverse, we use this flag to swap
118 * before/after in add_dep().
119 */
120 enum direction { F, R };
121
122 struct nir_schedule_class_dep {
123 int klass;
124 nir_schedule_node *node;
125 struct nir_schedule_class_dep *next;
126 };
127
128 typedef struct {
129 nir_schedule_scoreboard *scoreboard;
130
131 /* Map from nir_register to nir_schedule_node * */
132 struct hash_table *reg_map;
133
134    /* Scheduler nodes for the last instruction involved in some class of dependency.
135 */
136 nir_schedule_node *load_input;
137 nir_schedule_node *store_shared;
138 nir_schedule_node *unknown_intrinsic;
139 nir_schedule_node *discard;
140 nir_schedule_node *jump;
141
142 struct nir_schedule_class_dep *class_deps;
143
144 enum direction dir;
145 } nir_deps_state;
146
147 static void *
148 _mesa_hash_table_search_data(struct hash_table *ht, void *key)
149 {
150 struct hash_entry *entry = _mesa_hash_table_search(ht, key);
151 if (!entry)
152 return NULL;
153 return entry->data;
154 }
155
156 static nir_schedule_node *
157 nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
158 {
159 return _mesa_hash_table_search_data(instr_map, instr);
160 }
161
162 static struct set *
163 nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
164 {
165 if (src->is_ssa) {
166 return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
167 } else {
168 return _mesa_hash_table_search_data(scoreboard->remaining_uses,
169 src->reg.reg);
170 }
171 }
172
173 static int
174 nir_schedule_def_pressure(nir_ssa_def *def)
175 {
176 return def->num_components;
177 }
178
179 static int
180 nir_schedule_src_pressure(nir_src *src)
181 {
182 if (src->is_ssa)
183 return nir_schedule_def_pressure(src->ssa);
184 else
185 return src->reg.reg->num_components;
186 }
187
188 static int
189 nir_schedule_dest_pressure(nir_dest *dest)
190 {
191 if (dest->is_ssa)
192 return nir_schedule_def_pressure(&dest->ssa);
193 else
194 return dest->reg.reg->num_components;
195 }
196
197 /**
198 * Adds a dependency such that @after must appear in the final program after
199 * @before.
200 *
201 * We add @before as a child of @after, so that DAG heads are the outputs of
202 * the program and we make our scheduling decisions bottom to top.
203 */
204 static void
205 add_dep(nir_deps_state *state,
206 nir_schedule_node *before,
207 nir_schedule_node *after)
208 {
209 if (!before || !after)
210 return;
211
212 assert(before != after);
213
214 if (state->dir == F)
215 dag_add_edge(&before->dag, &after->dag, NULL);
216 else
217 dag_add_edge(&after->dag, &before->dag, NULL);
218 }
219
220
221 static void
222 add_read_dep(nir_deps_state *state,
223 nir_schedule_node *before,
224 nir_schedule_node *after)
225 {
226 add_dep(state, before, after);
227 }
228
229 static void
230 add_write_dep(nir_deps_state *state,
231 nir_schedule_node **before,
232 nir_schedule_node *after)
233 {
234 add_dep(state, *before, after);
235 *before = after;
236 }
237
238 static bool
239 nir_schedule_reg_src_deps(nir_src *src, void *in_state)
240 {
241 nir_deps_state *state = in_state;
242
243 if (src->is_ssa)
244 return true;
245
246 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
247 src->reg.reg);
248 if (!entry)
249 return true;
250 nir_schedule_node *dst_n = entry->data;
251
252 nir_schedule_node *src_n =
253 nir_schedule_get_node(state->scoreboard->instr_map,
254 src->parent_instr);
255
256 add_dep(state, dst_n, src_n);
257
258 return true;
259 }
260
261 static bool
262 nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
263 {
264 nir_deps_state *state = in_state;
265
266 if (dest->is_ssa)
267 return true;
268
269 nir_schedule_node *dest_n =
270 nir_schedule_get_node(state->scoreboard->instr_map,
271 dest->reg.parent_instr);
272
273 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
274 dest->reg.reg);
275 if (!entry) {
276 _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
277 return true;
278 }
279 nir_schedule_node **before = (nir_schedule_node **)&entry->data;
280
281 add_write_dep(state, before, dest_n);
282
283 return true;
284 }
285
286 static bool
287 nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
288 {
289 nir_deps_state *state = in_state;
290 struct hash_table *instr_map = state->scoreboard->instr_map;
291 nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);
292
293 nir_foreach_use(src, def) {
294 nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
295 src->parent_instr);
296
297 add_read_dep(state, def_n, use_n);
298 }
299
300 return true;
301 }
302
303 static struct nir_schedule_class_dep *
304 nir_schedule_get_class_dep(nir_deps_state *state,
305 int klass)
306 {
307 for (struct nir_schedule_class_dep *class_dep = state->class_deps;
308 class_dep != NULL;
309 class_dep = class_dep->next) {
310 if (class_dep->klass == klass)
311 return class_dep;
312 }
313
314 struct nir_schedule_class_dep *class_dep =
315 ralloc(state->reg_map, struct nir_schedule_class_dep);
316
317 class_dep->klass = klass;
318 class_dep->node = NULL;
319 class_dep->next = state->class_deps;
320
321 state->class_deps = class_dep;
322
323 return class_dep;
324 }
325
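/* Backends can extend the dependency tracking below via options->intrinsic_cb.
 * A hypothetical sketch of such a callback (the intrinsic chosen and the class
 * id are made up for illustration; the callback arguments and the
 * nir_schedule_dependency fields match how the callback is invoked below):
 *
 *    static bool
 *    my_intrinsic_cb(nir_intrinsic_instr *instr, nir_schedule_dependency *dep,
 *                    void *cb_data)
 *    {
 *       if (instr->intrinsic == nir_intrinsic_store_output) {
 *          dep->klass = 0;
 *          dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
 *          return true;
 *       }
 *       return false;
 *    }
 */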
326 static void
327 nir_schedule_intrinsic_deps(nir_deps_state *state,
328 nir_intrinsic_instr *instr)
329 {
330 nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
331 &instr->instr);
332 const nir_schedule_options *options = state->scoreboard->options;
333 nir_schedule_dependency dep;
334
335 if (options->intrinsic_cb &&
336 options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
337 struct nir_schedule_class_dep *class_dep =
338 nir_schedule_get_class_dep(state, dep.klass);
339
340 switch (dep.type) {
341 case NIR_SCHEDULE_READ_DEPENDENCY:
342 add_read_dep(state, class_dep->node, n);
343 break;
344 case NIR_SCHEDULE_WRITE_DEPENDENCY:
345 add_write_dep(state, &class_dep->node, n);
346 break;
347 }
348 }
349
350 switch (instr->intrinsic) {
351 case nir_intrinsic_load_uniform:
352 case nir_intrinsic_load_ubo:
353 case nir_intrinsic_load_front_face:
354 break;
355
356 case nir_intrinsic_discard:
357 case nir_intrinsic_discard_if:
358       /* We are adding two dependencies:
359        *
360        * * An individual one that we could use to add a read_dep while handling
361        *   nir_instr_type_tex
362        *
363        * * One on the unknown intrinsic set, as we want discards to be
364        *   serialized in the same order relative to intervening stores or
365        *   atomic accesses to SSBOs and images
366        */
367 add_write_dep(state, &state->discard, n);
368 add_write_dep(state, &state->unknown_intrinsic, n);
369 break;
370
371 case nir_intrinsic_store_output:
372 /* For some hardware and stages, output stores affect the same shared
373 * memory as input loads.
374 */
375 if ((state->scoreboard->options->stages_with_shared_io_memory &
376 (1 << state->scoreboard->shader->info.stage)))
377 add_write_dep(state, &state->load_input, n);
378
379 /* Make sure that preceding discards stay before the store_output */
380 add_read_dep(state, state->discard, n);
381
382 break;
383
384 case nir_intrinsic_load_input:
385 case nir_intrinsic_load_per_vertex_input:
386 add_read_dep(state, state->load_input, n);
387 break;
388
389 case nir_intrinsic_load_shared:
390       /* Don't move load_shared beyond a following store_shared, as the store
391        * could change the loaded value
392 */
393 add_read_dep(state, state->store_shared, n);
394 break;
395
396 case nir_intrinsic_store_shared:
397 add_write_dep(state, &state->store_shared, n);
398 break;
399
400 case nir_intrinsic_control_barrier:
401 case nir_intrinsic_memory_barrier_shared:
402 add_write_dep(state, &state->store_shared, n);
403
404 /* Serialize against ssbos/atomics/etc. */
405 add_write_dep(state, &state->unknown_intrinsic, n);
406 break;
407
408 default:
409 /* Attempt to handle other intrinsics that we haven't individually
410 * categorized by serializing them in the same order relative to each
411 * other.
412 */
413 add_write_dep(state, &state->unknown_intrinsic, n);
414 break;
415 }
416 }
417
418 /**
419 * Common code for dependencies that need to be tracked both forward and
420 * backward.
421 *
422 * This is for things like "all reads of r4 have to happen between the r4
423 * writes that surround them".
424 */
425 static void
426 nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
427 {
428 nir_instr *instr = n->instr;
429
430 /* For NIR SSA defs, we only need to do a single pass of making the uses
431 * depend on the def.
432 */
433 if (state->dir == F)
434 nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);
435
436 /* For NIR regs, track the last writer in the scheduler state so that we
437 * can keep the writes in order and let reads get reordered only between
438 * each write.
439 */
440 nir_foreach_src(instr, nir_schedule_reg_src_deps, state);
441
442 nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);
443
444 /* Make sure any other instructions keep their positions relative to
445 * jumps.
446 */
447 if (instr->type != nir_instr_type_jump)
448 add_read_dep(state, state->jump, n);
449
450 switch (instr->type) {
451 case nir_instr_type_ssa_undef:
452 case nir_instr_type_load_const:
453 case nir_instr_type_alu:
454 case nir_instr_type_deref:
455 break;
456
457 case nir_instr_type_tex:
458 /* Don't move texture ops before a discard, as that could increase
459 * memory bandwidth for reading the discarded samples.
460 */
461 add_read_dep(state, state->discard, n);
462 break;
463
464 case nir_instr_type_jump:
465 add_write_dep(state, &state->jump, n);
466 break;
467
468 case nir_instr_type_call:
469 unreachable("Calls should have been lowered");
470 break;
471
472 case nir_instr_type_parallel_copy:
473 unreachable("Parallel copies should have been lowered");
474 break;
475
476 case nir_instr_type_phi:
477 unreachable("nir_schedule() should be called after lowering from SSA");
478 break;
479
480 case nir_instr_type_intrinsic:
481 nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
482 break;
483 }
484 }
485
486 static void
487 calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
488 {
489 nir_deps_state state = {
490 .scoreboard = scoreboard,
491 .dir = F,
492 .reg_map = _mesa_pointer_hash_table_create(NULL),
493 };
494
495 nir_foreach_instr(instr, block) {
496 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
497 instr);
498 nir_schedule_calculate_deps(&state, node);
499 }
500
501 ralloc_free(state.reg_map);
502 }
503
504 static void
505 calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
506 {
507 nir_deps_state state = {
508 .scoreboard = scoreboard,
509 .dir = R,
510 .reg_map = _mesa_pointer_hash_table_create(NULL),
511 };
512
513 nir_foreach_instr_reverse(instr, block) {
514 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
515 instr);
516 nir_schedule_calculate_deps(&state, node);
517 }
518
519 ralloc_free(state.reg_map);
520 }
521
522 typedef struct {
523 nir_schedule_scoreboard *scoreboard;
524 int regs_freed;
525 } nir_schedule_regs_freed_state;
526
527 static bool
528 nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
529 {
530 nir_schedule_regs_freed_state *state = in_state;
531 nir_schedule_scoreboard *scoreboard = state->scoreboard;
532 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
533
534 if (remaining_uses->entries == 1 &&
535 _mesa_set_search(remaining_uses, src->parent_instr)) {
536 state->regs_freed += nir_schedule_src_pressure(src);
537 }
538
539 return true;
540 }
541
542 static bool
543 nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
544 {
545 nir_schedule_regs_freed_state *state = in_state;
546
547 state->regs_freed -= nir_schedule_def_pressure(def);
548
549 return true;
550 }
551
552 static bool
553 nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
554 {
555 nir_schedule_regs_freed_state *state = in_state;
556 nir_schedule_scoreboard *scoreboard = state->scoreboard;
557
558 if (dest->is_ssa)
559 return true;
560
561 nir_register *reg = dest->reg.reg;
562
563 /* Only the first def of a reg counts against register pressure. */
564 if (!_mesa_set_search(scoreboard->live_values, reg))
565 state->regs_freed -= nir_schedule_dest_pressure(dest);
566
567 return true;
568 }
569
570 static int
571 nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
572 {
573 nir_schedule_regs_freed_state state = {
574 .scoreboard = scoreboard,
575 };
576
577 nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
578
579 nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
580
581 nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);
582
583 return state.regs_freed;
584 }
585
586 /**
587 * Chooses an instruction that will minimise the register pressure as much as
588 * possible. This should only be used as a fallback when the regular scheduling
589 * generates a shader whose register allocation fails.
590 */
591 static nir_schedule_node *
592 nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
593 {
594 nir_schedule_node *chosen = NULL;
595
596    /* Find the leader in the ready (shouldn't-stall) set with the minimum
597 * cost.
598 */
599 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
600 if (scoreboard->time < n->ready_time)
601 continue;
602
603 if (!chosen || chosen->max_delay > n->max_delay)
604 chosen = n;
605 }
606 if (chosen) {
607 if (debug) {
608 fprintf(stderr, "chose (ready fallback): ");
609 nir_print_instr(chosen->instr, stderr);
610 fprintf(stderr, "\n");
611 }
612
613 return chosen;
614 }
615
616 /* Otherwise, choose the leader with the minimum cost. */
617 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
618 if (!chosen || chosen->max_delay > n->max_delay)
619 chosen = n;
620 }
621 if (debug) {
622 fprintf(stderr, "chose (leader fallback): ");
623 nir_print_instr(chosen->instr, stderr);
624 fprintf(stderr, "\n");
625 }
626
627 return chosen;
628 }
629
630 /**
631 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
632 * Scheduling for Parallelism) heuristic.
633 *
634  * Picks an instruction on the critical path that's ready to execute without
635 * stalls, if possible, otherwise picks the instruction on the critical path.
636 */
637 static nir_schedule_node *
638 nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
639 {
640 nir_schedule_node *chosen = NULL;
641
642 /* Find the leader in the ready (shouldn't-stall) set with the maximum
643 * cost.
644 */
645 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
646 if (scoreboard->time < n->ready_time)
647 continue;
648
649 if (!chosen || chosen->max_delay < n->max_delay)
650 chosen = n;
651 }
652 if (chosen) {
653 if (debug) {
654 fprintf(stderr, "chose (ready): ");
655 nir_print_instr(chosen->instr, stderr);
656 fprintf(stderr, "\n");
657 }
658
659 return chosen;
660 }
661
662 /* Otherwise, choose the leader with the maximum cost. */
663 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
664 if (!chosen || chosen->max_delay < n->max_delay)
665 chosen = n;
666 }
667 if (debug) {
668 fprintf(stderr, "chose (leader): ");
669 nir_print_instr(chosen->instr, stderr);
670 fprintf(stderr, "\n");
671 }
672
673 return chosen;
674 }
675
676 /**
677 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
678 * Scheduling for Register pressure) heuristic.
679 */
680 static nir_schedule_node *
681 nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
682 {
683 nir_schedule_node *chosen = NULL;
684
685 /* Find a ready inst with regs freed and pick the one with max cost. */
686 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
687 if (n->ready_time > scoreboard->time)
688 continue;
689
690 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
691
692 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
693 chosen = n;
694 }
695 }
696 if (chosen) {
697 if (debug) {
698 fprintf(stderr, "chose (freed+ready): ");
699 nir_print_instr(chosen->instr, stderr);
700 fprintf(stderr, "\n");
701 }
702
703 return chosen;
704 }
705
706 /* Find a leader with regs freed and pick the one with max cost. */
707 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
708 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
709
710 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
711 chosen = n;
712 }
713 }
714 if (chosen) {
715 if (debug) {
716 fprintf(stderr, "chose (regs freed): ");
717 nir_print_instr(chosen->instr, stderr);
718 fprintf(stderr, "\n");
719 }
720
721 return chosen;
722 }
723
724 /* Find a partially evaluated path and try to finish it off */
725 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
726 if (n->partially_evaluated_path &&
727 (!chosen || chosen->max_delay < n->max_delay)) {
728 chosen = n;
729 }
730 }
731 if (chosen) {
732 if (debug) {
733 fprintf(stderr, "chose (partial path): ");
734 nir_print_instr(chosen->instr, stderr);
735 fprintf(stderr, "\n");
736 }
737
738 return chosen;
739 }
740
741 /* Contra the paper, pick a leader with no effect on used regs. This may
742 * open up new opportunities, as otherwise a single-operand instr consuming
743     * a value will tend to block progress toward freeing that value. This had a
744 * massive effect on reducing spilling on V3D.
745 *
746 * XXX: Should this prioritize ready?
747 */
748 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
749 if (nir_schedule_regs_freed(scoreboard, n) != 0)
750 continue;
751
752 if (!chosen || chosen->max_delay < n->max_delay)
753 chosen = n;
754 }
755 if (chosen) {
756 if (debug) {
757 fprintf(stderr, "chose (regs no-op): ");
758 nir_print_instr(chosen->instr, stderr);
759 fprintf(stderr, "\n");
760 }
761
762 return chosen;
763 }
764
765 /* Pick the max delay of the remaining ready set. */
766 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
767 if (n->ready_time > scoreboard->time)
768 continue;
769 if (!chosen || chosen->max_delay < n->max_delay)
770 chosen = n;
771 }
772 if (chosen) {
773 if (debug) {
774 fprintf(stderr, "chose (ready max delay): ");
775 nir_print_instr(chosen->instr, stderr);
776 fprintf(stderr, "\n");
777 }
778 return chosen;
779 }
780
781 /* Pick the max delay of the remaining leaders. */
782 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
783 if (!chosen || chosen->max_delay < n->max_delay)
784 chosen = n;
785 }
786
787 if (debug) {
788 fprintf(stderr, "chose (max delay): ");
789 nir_print_instr(chosen->instr, stderr);
790 fprintf(stderr, "\n");
791 }
792
793 return chosen;
794 }
795
796 static void
797 dump_state(nir_schedule_scoreboard *scoreboard)
798 {
799 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
800 fprintf(stderr, "maxdel %5d ", n->max_delay);
801 nir_print_instr(n->instr, stderr);
802 fprintf(stderr, "\n");
803
804 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
805 nir_schedule_node *child = (nir_schedule_node *)edge->child;
806
807 fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
808 nir_print_instr(child->instr, stderr);
809 fprintf(stderr, "\n");
810 }
811 }
812 }
813
814 static void
815 nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
816 void *reg_or_def,
817 nir_instr *reg_or_def_parent,
818 int pressure)
819 {
820 /* Make the value live if it's the first time it's been used. */
821 if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
822 _mesa_set_add(scoreboard->live_values, reg_or_def);
823 scoreboard->pressure += pressure;
824 }
825
826 /* Make the value dead if it's the last remaining use. Be careful when one
827 * instruction uses a value twice to not decrement pressure twice.
828 */
829 struct set *remaining_uses =
830 _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
831 struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
832 if (entry) {
833 _mesa_set_remove(remaining_uses, entry);
834
835 if (remaining_uses->entries == 0)
836 scoreboard->pressure -= pressure;
837 }
838 }
839
840 static bool
841 nir_schedule_mark_src_scheduled(nir_src *src, void *state)
842 {
843 nir_schedule_scoreboard *scoreboard = state;
844 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
845
846 struct set_entry *entry = _mesa_set_search(remaining_uses,
847 src->parent_instr);
848 if (entry) {
849 /* Once we've used an SSA value in one instruction, bump the priority of
850 * the other uses so the SSA value can get fully consumed.
851 *
852        * We don't do this for registers, as it would be a hassle and it's
853 * unclear if that would help or not. Also, skip it for constants, as
854 * they're often folded as immediates into backend instructions and have
855 * many unrelated instructions all referencing the same value (0).
856 */
857 if (src->is_ssa &&
858 src->ssa->parent_instr->type != nir_instr_type_load_const) {
859 nir_foreach_use(other_src, src->ssa) {
860 if (other_src->parent_instr == src->parent_instr)
861 continue;
862
863 nir_schedule_node *n =
864 nir_schedule_get_node(scoreboard->instr_map,
865 other_src->parent_instr);
866
867 if (n && !n->partially_evaluated_path) {
868 if (debug) {
869 fprintf(stderr, " New partially evaluated path: ");
870 nir_print_instr(n->instr, stderr);
871 fprintf(stderr, "\n");
872 }
873
874 n->partially_evaluated_path = true;
875 }
876 }
877 }
878 }
879
880 nir_schedule_mark_use(scoreboard,
881 src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
882 src->parent_instr,
883 nir_schedule_src_pressure(src));
884
885 return true;
886 }
887
888 static bool
889 nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
890 {
891 nir_schedule_scoreboard *scoreboard = state;
892
893 nir_schedule_mark_use(scoreboard, def, def->parent_instr,
894 nir_schedule_def_pressure(def));
895
896 return true;
897 }
898
899 static bool
900 nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
901 {
902 nir_schedule_scoreboard *scoreboard = state;
903
904 /* SSA defs were handled in nir_schedule_mark_def_scheduled()
905 */
906 if (dest->is_ssa)
907 return true;
908
909 /* XXX: This is not actually accurate for regs -- the last use of a reg may
910 * have a live interval that extends across control flow. We should
911 * calculate the live ranges of regs, and have scheduler nodes for the CF
912 * nodes that also "use" the reg.
913 */
914 nir_schedule_mark_use(scoreboard, dest->reg.reg,
915 dest->reg.parent_instr,
916 nir_schedule_dest_pressure(dest));
917
918 return true;
919 }
920
921 static void
922 nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
923 nir_schedule_node *n)
924 {
925 nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
926 nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
927 nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);
928
929 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
930 nir_schedule_node *child = (nir_schedule_node *)edge->child;
931
932 child->ready_time = MAX2(child->ready_time,
933 scoreboard->time + n->delay);
934
935 if (child->dag.parent_count == 1) {
936 if (debug) {
937 fprintf(stderr, " New DAG head: ");
938 nir_print_instr(child->instr, stderr);
939 fprintf(stderr, "\n");
940 }
941 }
942 }
943
944 dag_prune_head(scoreboard->dag, &n->dag);
945
946 scoreboard->time = MAX2(n->ready_time, scoreboard->time);
947 scoreboard->time++;
948 }
949
950 static void
951 nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
952 {
953 while (!list_is_empty(&scoreboard->dag->heads)) {
954 if (debug) {
955 fprintf(stderr, "current list:\n");
956 dump_state(scoreboard);
957 }
958
959 nir_schedule_node *chosen;
960 if (scoreboard->options->fallback)
961 chosen = nir_schedule_choose_instruction_fallback(scoreboard);
962 else if (scoreboard->pressure < scoreboard->options->threshold)
963 chosen = nir_schedule_choose_instruction_csp(scoreboard);
964 else
965 chosen = nir_schedule_choose_instruction_csr(scoreboard);
966
967 /* Now that we've scheduled a new instruction, some of its children may
968 * be promoted to the list of instructions ready to be scheduled.
969 */
970 nir_schedule_mark_node_scheduled(scoreboard, chosen);
971
972 /* Move the instruction to the end (so our first chosen instructions are
973 * the start of the program).
974 */
975 exec_node_remove(&chosen->instr->node);
976 exec_list_push_tail(&block->instr_list, &chosen->instr->node);
977
978 if (debug)
979 fprintf(stderr, "\n");
980 }
981 }
982
983 static uint32_t
984 nir_schedule_get_delay(nir_instr *instr)
985 {
986 switch (instr->type) {
987 case nir_instr_type_ssa_undef:
988 case nir_instr_type_load_const:
989 case nir_instr_type_alu:
990 case nir_instr_type_deref:
991 case nir_instr_type_jump:
992 case nir_instr_type_parallel_copy:
993 case nir_instr_type_call:
994 case nir_instr_type_phi:
995 return 1;
996
997 case nir_instr_type_intrinsic:
998 /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
999 return 1;
1000
1001 case nir_instr_type_tex:
1002 /* Pick some large number to try to fetch textures early and sample them
1003 * late.
1004 */
1005 return 100;
1006 }
1007
1008 return 0;
1009 }
1010
1011 static void
1012 nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
1013 {
1014 nir_schedule_node *n = (nir_schedule_node *)node;
1015 uint32_t max_delay = 0;
1016
1017 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1018 nir_schedule_node *child = (nir_schedule_node *)edge->child;
1019 max_delay = MAX2(child->max_delay, max_delay);
1020 }
1021
1022 n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
1023 }
1024
1025 static void
1026 nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
1027 {
1028 void *mem_ctx = ralloc_context(NULL);
1029 scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);
1030
1031 scoreboard->dag = dag_create(mem_ctx);
1032
1033 nir_foreach_instr(instr, block) {
1034 nir_schedule_node *n =
1035 rzalloc(mem_ctx, nir_schedule_node);
1036
1037 n->instr = instr;
1038 n->delay = nir_schedule_get_delay(instr);
1039 dag_init_node(scoreboard->dag, &n->dag);
1040
1041 _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
1042 }
1043
1044 calculate_forward_deps(scoreboard, block);
1045 calculate_reverse_deps(scoreboard, block);
1046
1047 dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);
1048
1049 nir_schedule_instructions(scoreboard, block);
1050
1051 ralloc_free(mem_ctx);
1052 scoreboard->instr_map = NULL;
1053 }
1054
1055 static bool
1056 nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
1057 {
1058 nir_schedule_scoreboard *scoreboard = state;
1059 struct set *def_uses = _mesa_pointer_set_create(scoreboard);
1060
1061 _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);
1062
1063 _mesa_set_add(def_uses, def->parent_instr);
1064
1065 nir_foreach_use(src, def) {
1066 _mesa_set_add(def_uses, src->parent_instr);
1067 }
1068
1069 /* XXX: Handle if uses */
1070
1071 return true;
1072 }
1073
1074 static nir_schedule_scoreboard *
1075 nir_schedule_get_scoreboard(nir_shader *shader,
1076 const nir_schedule_options *options)
1077 {
1078 nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);
1079
1080 scoreboard->shader = shader;
1081 scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
1082 scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
1083 scoreboard->options = options;
1084 scoreboard->pressure = 0;
1085
1086 nir_foreach_function(function, shader) {
1087 nir_foreach_register(reg, &function->impl->registers) {
1088 struct set *register_uses =
1089 _mesa_pointer_set_create(scoreboard);
1090
1091 _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);
1092
1093 nir_foreach_use(src, reg) {
1094 _mesa_set_add(register_uses, src->parent_instr);
1095 }
1096
1097 /* XXX: Handle if uses */
1098
1099 nir_foreach_def(dest, reg) {
1100 _mesa_set_add(register_uses, dest->reg.parent_instr);
1101 }
1102 }
1103
1104 nir_foreach_block(block, function->impl) {
1105 nir_foreach_instr(instr, block) {
1106 nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
1107 scoreboard);
1108 }
1109
1110 /* XXX: We're ignoring if uses, which may prioritize scheduling other
1111 * uses of the if src even when it doesn't help. That's not many
1112 * values, though, so meh.
1113 */
1114 }
1115 }
1116
1117 return scoreboard;
1118 }
1119
1120 static void
1121 nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
1122 {
1123 #ifdef NDEBUG
1124 return;
1125 #endif
1126
1127 bool any_uses = false;
1128
1129 hash_table_foreach(scoreboard->remaining_uses, entry) {
1130 struct set *remaining_uses = entry->data;
1131
1132 set_foreach(remaining_uses, instr_entry) {
1133 if (!any_uses) {
1134 fprintf(stderr, "Tracked uses remain after scheduling. "
1135 "Affected instructions: \n");
1136 any_uses = true;
1137 }
1138 nir_print_instr(instr_entry->key, stderr);
1139 fprintf(stderr, "\n");
1140 }
1141 }
1142
1143 assert(!any_uses);
1144 }
1145
1146 /**
1147 * Schedules the NIR instructions to try to decrease stalls (for example,
1148 * delaying texture reads) while managing register pressure.
1149 *
1150 * The threshold represents "number of NIR register/SSA def channels live
1151 * before switching the scheduling heuristic to reduce register pressure",
1152 * since most of our GPU architectures are scalar (extending to vector with a
1153 * flag wouldn't be hard). This number should be a bit below the number of
1154 * registers available (counting any that may be occupied by system value
1155 * payload values, for example), since the heuristic may not always be able to
1156 * free a register immediately. The amount below the limit is up to you to
1157 * tune.
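 *
 * As a purely illustrative example (the numbers are hypothetical rather than
 * taken from any particular backend): a scalar GPU with 64 registers, a few
 * of which hold payload values, might pass a threshold somewhat below 60.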
1158 */
1159 void
1160 nir_schedule(nir_shader *shader,
1161 const nir_schedule_options *options)
1162 {
1163 nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
1164 options);
1165
1166 if (debug) {
1167 fprintf(stderr, "NIR shader before scheduling:\n");
1168 nir_print_shader(shader, stderr);
1169 }
1170
1171 nir_foreach_function(function, shader) {
1172 if (!function->impl)
1173 continue;
1174
1175 nir_foreach_block(block, function->impl) {
1176 nir_schedule_block(scoreboard, block);
1177 }
1178 }
1179
1180 nir_schedule_validate_uses(scoreboard);
1181
1182 ralloc_free(scoreboard);
1183 }