src/gallium/drivers/lima/ir/gp/scheduler.c
1 /*
2 * Copyright (c) 2017 Lima Project
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sub license,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the
12 * next paragraph) shall be included in all copies or substantial portions
13 * of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 */
24
25 #include <limits.h>
26
27 #include "gpir.h"
28
29 /*
30 * GP scheduling algorithm (by Connor Abbott <cwabbott0@gmail.com>)
31 *
32 * The GP pipeline has three main stages:
33 *
34 * --------------------------------------------------------
35 * | |
36 * | Register/Attr/Temp Fetch |
37 * | |
38 * --------------------------------------------------------
39 * | | | | | | |
40 * | Mul0 | Mul1 | Add0 | Add1 | Cplx | Pass |
41 * | | | | | | |
42 * --------------------------------------------------------
43 * | | | |
44 * | Complex1 | Temp/Register/Varying | Pass |
45 * | Stage 2 | Store | Stage 2 |
46 * | | | |
47 * --------------------------------------------------------
48 *
49 * Because of this setup, storing a register has a latency of three cycles.
50 * Also, the register file is organized into 4-component vectors, and the
51 * load stage can only load two vectors at a time. Aside from these highly
52 * constrained register load/store units, there is an explicit bypass
53  * network, where each unit (mul0/mul1/etc.) can access the result of
54  * any unit from the previous two cycles directly, except for the complex
55 * unit whose result can only be accessed for one cycle (since it's expected
56 * to be used directly by the complex2 instruction in the following cycle).
57 *
58 * Because of the very restricted register file, and because only rarely are
59 * all the units in use at the same time, it can be very beneficial to use
60 * the unused units to "thread" a value from source to destination by using
61 * moves in the otherwise-unused units, without involving the register file
62 * at all. It's very difficult to fully exploit this with a traditional
63 * scheduler, so we need to do something a little un-traditional. The 512
64 * instruction limit means that for more complex shaders, we need to do as
65 * well as possible or else the app won't even work.
66 *
67 * The scheduler works by considering the bypass network as a kind of
68 * register file. It's a quite unusual register file, since registers have to
69 * be assigned "on the fly" as we schedule operations, but with some care, we
70 * can use something conceptually similar to a linear-scan allocator to
71 * successfully schedule nodes to instructions without running into
72 * conflicts.
73 *
74 * Values in the IR are separated into normal values, or "value registers",
75 * which is what normal nodes like add, mul, etc. produce, and which only
76 * live inside one basic block, and registers, which can span multiple basic
77 * blocks but have to be accessed via special load_reg/store_reg nodes. RA
78 * assigns physical registers to both value registers and normal registers,
79 * treating load_reg/store_reg as a move instruction, but these are only used
80 * directly for normal registers -- the physreg assigned to a value register
81 * is "fake," and is only used inside the scheduler. Before scheduling we
82  * insert read-after-write and write-after-read dependencies, even for value
83  * registers, as if we're going to use those, but then we throw them away.
84  * For example, if we had something like:
85 *
86 * (*)r2 = add (*)r1, (*)r2
87 * (*)r1 = load_reg r0
88 *
89 * we'd insert a write-after-read dependency between the add and load_reg,
90 * even though the starred registers aren't actually used by the scheduler
91 * after this step. This step is crucial since it guarantees that during any
92 * point in the schedule, the number of live registers + live value registers
93 * will never exceed the capacity of the register file and the bypass network
94 * combined. This is because each live register/value register will have a
95 * different fake number, thanks to the fake dependencies inserted before
96 * scheduling. This allows us to not have to worry about spilling to
97 * temporaries, which is only done ahead of time.
98 *
99 * The scheduler is a bottom-up scheduler. It keeps track of each live value
100 * register, and decides on-the-fly which value registers to keep in the
101 * bypass network and which to "spill" to registers. Of particular importance
102 * is the "ready list," which consists of "input nodes" (nodes that produce a
103 * value that can be consumed via the bypass network), both "partially ready"
104 * (only some of the uses have been scheduled) and "fully ready" (all uses
105 * have been scheduled), as well as other non-input nodes like register
106 * stores. Each input node on the ready list represents a live value register
107 * before the current instruction. There must be at most 11 such input nodes
108 * at all times, since there are only 11 slots in the next two instructions
109 * which can reach the current instruction.
110 *
111 * An input node is a "max node" if it has a use two cycles ago, which must be
112 * connected to a definition this cycle. Otherwise it may be a "next max node"
113 * if it will be a max node on the next instruction (i.e. it has a use at most
114 * one cycle ago), or it may be neither if all of its uses are this cycle. As
115 * we keep adding instructions to the front, input nodes graduate from
116 * neither, to next max, to max, unless we decide to insert a move to keep it
117 * alive longer, at which point any uses after the current instruction are
118 * rewritten to be uses of the move so that the original node returns to
119 * neither. The scheduler decides which nodes to try freely, but we have to
120 * reserve slots for two different reasons: (1) out of the 5 non-complex
121 * slots, we reserve a slot for each max node, so that we can connect a
122 * definition to the use 2 cycles ago. (2) Out of all 6 slots, we reserve a
123 * slot for every next-max node above 5, so that for the next instruction
124 * there are no more than 5 max nodes. When a max or next-max node gets
125 * scheduled, the corresponding reservation is reduced by one. At the end, we
126 * insert moves for every slot that was reserved. The reservation is actually
127  * managed by gpir_instr, and all we have to do is tell it how many to reserve
128 * at the beginning and then tell it which nodes are max/next-max nodes. When
129 * we start scheduling an instruction, there will be at most 5 max nodes
130 * thanks to the previous instruction's next-max reservation/move insertion.
131 * Since there are at most 11 total input nodes, if there are N max nodes,
132 * there are at most 11 - N next-max nodes, and therefore at most 11 - N - 5 =
133 * 6 - N slots need to be reserved for next-max nodes, and so at most
134 * 6 - N + N = 6 slots need to be reserved in total, exactly the total number
135 * of slots. So, thanks to the total input node restriction, we will never
136 * need to reserve too many slots.
137 *
138 * It sometimes happens that scheduling a given node will violate this total
139 * input node restriction, or that a reservation will mean that we can't
140 * schedule it. We first schedule a node "speculatively" to see if this is a
141 * problem. If some of the node's sources are loads, then we can schedule
142 * the node and its dependent loads in one swoop to avoid going over the
143 * pressure limit. If that fails, we can try to spill a ready or
144 * partially-ready input node to a register by rewriting all of its uses to
145 * refer to a register load. This removes it from the list of ready and
146 * partially ready input nodes as all of its uses are now unscheduled. If
147 * successful, we can then proceed with scheduling the original node. All of
148 * this happens "speculatively," meaning that afterwards the node is removed
149 * and the entire state of the scheduler is reverted to before it was tried, to
150 * ensure that we never get into an invalid state and run out of spots for
151 * moves. In try_nodes(), we try to schedule each node speculatively on the
152 * ready list, keeping only the nodes that could be successfully scheduled, so
153 * that when we finally decide which node to actually schedule, we know it
154 * will succeed. This is how we decide on the fly which values go in
155 * registers and which go in the bypass network. Note that "unspilling" a
156 * value is simply a matter of scheduling the store_reg instruction created
157 * when we spill.
158 *
159 * The careful accounting of live value registers, reservations for moves, and
160 * speculative scheduling guarantee that we never run into a failure case
161 * while scheduling. However, we need to make sure that this scheduler will
162 * not get stuck in an infinite loop, i.e. that we'll always make forward
163 * progress by eventually scheduling a non-move node. If we run out of value
164 * registers, then we may have to spill a node to a register. If we
165  * were to schedule one of the fully-ready nodes, then we'd have 11 + N live
166  * value registers (for some N > 0) before the current instruction. But since
167  * there are at most 64 + 11 live registers and value registers in total,
168  * thanks to the fake dependencies we inserted before scheduling, there are at
169  * most 64 - N live physical registers, and therefore at least N registers are
170  * available for spilling. Not all of these will be available immediately, since
171 * in order to spill a node to a given register we have to ensure that there
172 * are slots available to rewrite every use to a load instruction, and that
173 * may not be the case. There may also be intervening writes which prevent
174 * some registers from being used. However, these are all temporary problems,
175 * since as we create each instruction, we create additional register load
176 * slots that can be freely used for spilling, and we create more move nodes
177 * which means that the uses of the nodes we're trying to spill keep moving
178 * forward. This means that eventually, these problems will go away, at which
179 * point we'll be able to spill a node successfully, so eventually we'll be
180 * able to schedule the first node on the ready list.
181 */
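
/*
 * To make the reservation arithmetic above concrete, here is a small worked
 * example. This is an illustrative sketch only, not code used by the
 * scheduler; MAX2 is the usual mesa helper macro.
 *
 *    int max_nodes = 2;                       // N: nodes with a use 2 cycles ago
 *    int next_max_nodes = 11 - max_nodes;     // worst case: 9
 *    int reserved_for_max = max_nodes;        // one non-complex slot each
 *    int reserved_for_next_max =
 *       MAX2(next_max_nodes - 5, 0);          // 9 - 5 = 4
 *
 *    // 2 + 4 = 6: never more than the six ALU slots that exist.
 *    assert(reserved_for_max + reserved_for_next_max <= 6);
 */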
182
183 typedef struct {
184 /* This is the list of ready and partially-ready nodes. A partially-ready
185 * node must have at least one input dependency already scheduled.
186 */
187 struct list_head ready_list;
188
189 /* The number of ready or partially-ready nodes with at least one input
190 * dependency already scheduled. In other words, the number of live value
191 * registers. This must be at most 11.
192 */
193 int ready_list_slots;
194
195 /* The physical registers live into the current instruction. */
196 uint64_t live_physregs;
197
198 /* The current instruction. */
199 gpir_instr *instr;
200
201 /* The current basic block. */
202 gpir_block *block;
203
204 /* True if at least one node failed to schedule due to lack of available
205 * value registers.
206 */
207 bool try_spill_all;
208
209    /* The number of max nodes that need to be spilled in order to
210     * successfully schedule the instruction.
211     */
212 int max_node_spill_needed;
213
214    /* The number of max and next-max nodes that need to be spilled in order
215     * to successfully schedule the instruction.
216     */
217 int total_spill_needed;
218 } sched_ctx;
219
220 static int gpir_min_dist_alu(gpir_dep *dep)
221 {
222 switch (dep->pred->op) {
223 case gpir_op_load_uniform:
224 case gpir_op_load_temp:
225 case gpir_op_load_reg:
226 case gpir_op_load_attribute:
227 return 0;
228
229 case gpir_op_complex1:
230 return 2;
231
232 default:
233 return 1;
234 }
235 }
236
237 static int gpir_get_min_dist(gpir_dep *dep)
238 {
239 switch (dep->type) {
240 case GPIR_DEP_INPUT:
241 switch (dep->succ->op) {
242 case gpir_op_store_temp:
243 case gpir_op_store_reg:
244 case gpir_op_store_varying:
245 /* Stores must use an alu node as input. Also, complex1 takes two
246 * cycles, which means that its result cannot be stored to a register
247 * as part of the normal path, and therefore it must also have a move
248 * inserted.
249 */
250 if (dep->pred->type == gpir_node_type_load ||
251 dep->pred->op == gpir_op_complex1)
252 return INT_MAX >> 2;
253 else
254 return 0;
255
256 default:
257 return gpir_min_dist_alu(dep);
258 }
259
260 case GPIR_DEP_OFFSET:
261 assert(dep->succ->op == gpir_op_store_temp);
262 return gpir_min_dist_alu(dep);
263
264 case GPIR_DEP_READ_AFTER_WRITE:
265 if (dep->succ->op == gpir_op_load_temp &&
266 dep->pred->op == gpir_op_store_temp) {
267 return 4;
268 } else if (dep->succ->op == gpir_op_load_reg &&
269 dep->pred->op == gpir_op_store_reg) {
270 return 3;
271 } else if ((dep->pred->op == gpir_op_store_temp_load_off0 ||
272 dep->pred->op == gpir_op_store_temp_load_off1 ||
273 dep->pred->op == gpir_op_store_temp_load_off2) &&
274 dep->succ->op == gpir_op_load_uniform) {
275 return 4;
276 } else {
277 /* Fake dependency */
278 return 0;
279 }
280
281 case GPIR_DEP_WRITE_AFTER_READ:
282 return 0;
283 }
284
285 return 0;
286 }
287
288 static int gpir_max_dist_alu(gpir_dep *dep)
289 {
290 switch (dep->pred->op) {
291 case gpir_op_load_uniform:
292 case gpir_op_load_temp:
293 return 0;
294 case gpir_op_load_attribute:
295 return 1;
296 case gpir_op_load_reg:
297 if (dep->pred->sched.pos < GPIR_INSTR_SLOT_REG0_LOAD0 ||
298 dep->pred->sched.pos > GPIR_INSTR_SLOT_REG0_LOAD3)
299 return 0;
300 else
301 return 1;
302 case gpir_op_exp2_impl:
303 case gpir_op_log2_impl:
304 case gpir_op_rcp_impl:
305 case gpir_op_rsqrt_impl:
306 case gpir_op_store_temp_load_off0:
307 case gpir_op_store_temp_load_off1:
308 case gpir_op_store_temp_load_off2:
309 return 1;
310 case gpir_op_mov:
311 if (dep->pred->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
312 return 1;
313 else
314 return 2;
315 default:
316 return 2;
317 }
318 }
319
320 static int gpir_get_max_dist(gpir_dep *dep)
321 {
322 switch (dep->type) {
323 case GPIR_DEP_INPUT:
324 switch (dep->succ->op) {
325 case gpir_op_store_temp:
326 case gpir_op_store_reg:
327 case gpir_op_store_varying:
328 return 0;
329
330 default:
331 return gpir_max_dist_alu(dep);
332 }
333
334 case GPIR_DEP_OFFSET:
335 assert(dep->succ->op == gpir_op_store_temp);
336 return gpir_max_dist_alu(dep);
337
338 default:
339 return INT_MAX >> 2; /* Don't want to overflow... */
340 }
341 }
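
/*
 * Taken together, gpir_get_min_dist() and gpir_get_max_dist() bound the
 * window of instruction indices in which a predecessor may be placed
 * relative to an already-scheduled successor (see the index checks in
 * _try_place_node() below). A rough sketch of that check, with illustrative
 * variable names, assuming a plain ALU value feeding another ALU op
 * (min dist 1, max dist 2) and remembering that indices grow as we move
 * earlier in the program since we schedule bottom-up:
 *
 *    int lo = succ_instr_index + 1;   // gpir_get_min_dist() for an ALU dep
 *    int hi = succ_instr_index + 2;   // gpir_get_max_dist(): bypass network
 *    bool ok = pred_instr_index >= lo && pred_instr_index <= hi;
 */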
342
343 static void schedule_update_distance(gpir_node *node)
344 {
345 if (gpir_node_is_leaf(node)) {
346 node->sched.dist = 0;
347 return;
348 }
349
350 gpir_node_foreach_pred(node, dep) {
351 gpir_node *pred = dep->pred;
352
353 if (pred->sched.dist < 0)
354 schedule_update_distance(pred);
355
356 int dist = pred->sched.dist + gpir_min_dist_alu(dep);
357 if (node->sched.dist < dist)
358 node->sched.dist = dist;
359 }
360 }
361
362 static bool gpir_is_input_node(gpir_node *node)
363 {
364 gpir_node_foreach_succ(node, dep) {
365 if (dep->type == GPIR_DEP_INPUT)
366 return true;
367 }
368 return false;
369 }
370
371
372 /* Get the number of slots required for a node on the ready list.
373 */
374 static int gpir_get_slots_required(gpir_node *node)
375 {
376 if (!gpir_is_input_node(node))
377 return 0;
378
379 /* Note that we assume every node only consumes one slot, even dual-slot
380 * instructions. While dual-slot instructions may consume more than one
381 * slot, we can always safely insert a move if it turns out that there
382 * isn't enough space for them. There's the risk that we get stuck in an
383 * infinite loop if all the fully ready nodes are dual-slot nodes, but we
384 * rely on spilling to registers to save us here.
385 */
386 return 1;
387 }
388
389 static void verify_ready_list(sched_ctx *ctx)
390 {
391 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
392 if (!gpir_is_input_node(node)) {
393 assert(node->sched.ready);
394 }
395
396 if (node->sched.ready) {
397 /* Every successor must have been scheduled */
398 gpir_node_foreach_succ(node, dep) {
399 assert(dep->succ->sched.instr);
400 }
401 } else {
402 /* There must be at least one successor that's not scheduled. */
403 bool unscheduled = false;
404 gpir_node_foreach_succ(node, dep) {
405 unscheduled |= !(dep->succ->sched.instr);
406 }
407
408 assert(unscheduled);
409 }
410 }
411 }
412
413 static void schedule_insert_ready_list(sched_ctx *ctx,
414 gpir_node *insert_node)
415 {
416    /* A node can be fully ready or partially ready:
417     *   fully ready: all of its successors have been scheduled
418     *   partially ready: some of its input successors have been scheduled
419     *
420     * Both fully-ready and partially-ready nodes need to be inserted into
421     * the ready list, but we only schedule a move node for a partially-ready
422     * node.
423     */
424 bool ready = true, insert = false;
425 gpir_node_foreach_succ(insert_node, dep) {
426 gpir_node *succ = dep->succ;
427 if (succ->sched.instr) {
428 if (dep->type == GPIR_DEP_INPUT)
429 insert = true;
430 }
431 else
432 ready = false;
433 }
434
435 insert_node->sched.ready = ready;
436 /* for root node */
437 insert |= ready;
438
439 if (!insert || insert_node->sched.inserted)
440 return;
441
442 struct list_head *insert_pos = &ctx->ready_list;
443 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
444 if (insert_node->sched.dist > node->sched.dist ||
445 gpir_op_infos[insert_node->op].schedule_first) {
446 insert_pos = &node->list;
447 break;
448 }
449 }
450
451 list_addtail(&insert_node->list, insert_pos);
452 insert_node->sched.inserted = true;
453 ctx->ready_list_slots += gpir_get_slots_required(insert_node);
454 }
455
456 static int gpir_get_max_start(gpir_node *node)
457 {
458 int max_start = 0;
459
460    /* find the max start instr constrained by all successors */
461 gpir_node_foreach_succ(node, dep) {
462 gpir_node *succ = dep->succ;
463 if (!succ->sched.instr)
464 continue;
465
466 int start = succ->sched.instr->index + gpir_get_min_dist(dep);
467 if (start > max_start)
468 max_start = start;
469 }
470
471 return max_start;
472 }
473
474 static int gpir_get_min_end(gpir_node *node)
475 {
476 int min_end = INT_MAX;
477
478    /* find the min end instr constrained by all successors */
479 gpir_node_foreach_succ(node, dep) {
480 gpir_node *succ = dep->succ;
481 if (!succ->sched.instr)
482 continue;
483
484 int end = succ->sched.instr->index + gpir_get_max_dist(dep);
485 if (end < min_end)
486 min_end = end;
487 }
488
489 return min_end;
490 }
491
492 static gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node)
493 {
494 gpir_load_node *load = gpir_node_to_load(node);
495
496 for (int i = GPIR_INSTR_SLOT_REG0_LOAD0; i <= GPIR_INSTR_SLOT_MEM_LOAD3; i++) {
497 if (!instr->slots[i])
498 continue;
499
500 gpir_load_node *iload = gpir_node_to_load(instr->slots[i]);
501 if (load->node.op == iload->node.op &&
502 load->index == iload->index &&
503 load->component == iload->component)
504 return &iload->node;
505 }
506 return NULL;
507 }
508
509 /* Simply place the node into the given instruction without trying to deal
510  * with liveness or the ready list. This will only fail if the node cannot
511  * be placed due to a lack of available slots. In addition to normal
512 * node placement, this is also used for placing loads when spilling to
513 * registers.
514 */
515 static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node)
516 {
517 if (node->type == gpir_node_type_load) {
518 gpir_node *load = gpir_sched_instr_has_load(instr, node);
519 if (load) {
520 /* This node may have a store as a successor, in which case we have to
521 * fail it exactly like below in order to later create a move node in
522 * between.
523 */
524 if (instr->index < gpir_get_max_start(node))
525 return false;
526
527 gpir_debug("same load %d in instr %d for node %d\n",
528 load->index, instr->index, node->index);
529
530        /* Not a real merge of the two nodes; just pretend this one was scheduled in the same place. */
531 node->sched.instr = load->sched.instr;
532 node->sched.pos = load->sched.pos;
533 return true;
534 }
535 }
536
537 node->sched.instr = instr;
538
539 int max_node_spill_needed = INT_MAX;
540 int total_spill_needed = INT_MAX;
541 int *slots = gpir_op_infos[node->op].slots;
542 for (int i = 0; slots[i] != GPIR_INSTR_SLOT_END; i++) {
543 node->sched.pos = slots[i];
544 if (instr->index >= gpir_get_max_start(node) &&
545 instr->index <= gpir_get_min_end(node) &&
546 gpir_instr_try_insert_node(instr, node))
547 return true;
548 if (ctx->instr->non_cplx_slot_difference ||
549 ctx->instr->slot_difference) {
550 /* If one of these fields is non-zero, then we could insert the node
551 * here after spilling. To get an accurate count of how many nodes we
552 * need to spill, we need to choose one of the positions where there
553 * were nonzero slot differences, preferably one with the smallest
554 * difference (so we don't have to spill as much).
555 */
556 if (ctx->instr->non_cplx_slot_difference < max_node_spill_needed ||
557 ctx->instr->slot_difference < total_spill_needed) {
558 max_node_spill_needed = ctx->instr->non_cplx_slot_difference;
559 total_spill_needed = ctx->instr->slot_difference;
560 }
561 }
562 }
563
564 if (max_node_spill_needed != INT_MAX) {
565 /* Indicate how many spill nodes are needed. */
566 ctx->max_node_spill_needed = MAX2(ctx->max_node_spill_needed,
567 max_node_spill_needed);
568 ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
569 total_spill_needed);
570 }
571 node->sched.instr = NULL;
572 node->sched.pos = -1;
573 return false;
574 }
575
576 /* Try to place just the node given, updating the ready list. If "speculative"
577  * is true, then this is part of the pre-commit phase. If false, then we have
578 * committed to placing this node, so update liveness and ready list
579 * information.
580 */
581
582 static bool schedule_try_place_node(sched_ctx *ctx, gpir_node *node,
583 bool speculative)
584 {
585 if (!_try_place_node(ctx, ctx->instr, node)) {
586 if (!speculative)
587 gpir_debug("failed to place %d\n", node->index);
588 return false;
589 }
590
591 ctx->ready_list_slots -= gpir_get_slots_required(node);
592
593 if (!speculative) {
594 gpir_debug("placed node %d\n", node->index);
595
596 /* We assume here that writes are placed before reads. If this changes,
597 * then this needs to be updated.
598 */
599 if (node->op == gpir_op_store_reg) {
600 gpir_store_node *store = gpir_node_to_store(node);
601 ctx->live_physregs &=
602 ~(1ull << (4 * store->index + store->component));
603 if (store->child->sched.physreg_store == store)
604 store->child->sched.physreg_store = NULL;
605 }
606
607 if (node->op == gpir_op_load_reg) {
608 gpir_load_node *load = gpir_node_to_load(node);
609 ctx->live_physregs |=
610 (1ull << (4 * load->index + load->component));
611 }
612
613 list_del(&node->list);
614 list_add(&node->list, &ctx->block->node_list);
615 gpir_node_foreach_pred(node, dep) {
616 gpir_node *pred = dep->pred;
617 schedule_insert_ready_list(ctx, pred);
618 }
619 } else {
620 gpir_node_foreach_pred(node, dep) {
621 gpir_node *pred = dep->pred;
622 if (!pred->sched.inserted && dep->type == GPIR_DEP_INPUT)
623 ctx->ready_list_slots += gpir_get_slots_required(pred);
624 }
625 }
626
627 return true;
628 }
629
630 /* Create a new node with "node" as the child, replace all uses of "node" with
631 * this new node, and replace "node" with it in the ready list.
632 */
633 static gpir_node *create_replacement(sched_ctx *ctx, gpir_node *node,
634 gpir_op op)
635 {
636
637 gpir_alu_node *new_node = gpir_node_create(node->block, op);
638 if (unlikely(!new_node))
639 return NULL;
640
641 new_node->children[0] = node;
642 new_node->num_child = 1;
643
644 new_node->node.sched.instr = NULL;
645 new_node->node.sched.pos = -1;
646 new_node->node.sched.dist = node->sched.dist;
647 new_node->node.sched.max_node = node->sched.max_node;
648 new_node->node.sched.next_max_node = node->sched.next_max_node;
649 new_node->node.sched.complex_allowed = node->sched.complex_allowed;
650
651 ctx->ready_list_slots--;
652 list_del(&node->list);
653 node->sched.max_node = false;
654 node->sched.next_max_node = false;
655 node->sched.ready = false;
656 node->sched.inserted = false;
657 gpir_node_replace_succ(&new_node->node, node);
658 gpir_node_add_dep(&new_node->node, node, GPIR_DEP_INPUT);
659 schedule_insert_ready_list(ctx, &new_node->node);
660 return &new_node->node;
661 }
662
663 static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
664 {
665 gpir_node *move = create_replacement(ctx, node, gpir_op_mov);
666 gpir_debug("create move %d for %d\n", move->index, node->index);
667 return move;
668 }
669
670 static gpir_node *create_postlog2(sched_ctx *ctx, gpir_node *node)
671 {
672 assert(node->op == gpir_op_complex1);
673 gpir_node *postlog2 = create_replacement(ctx, node, gpir_op_postlog2);
674 gpir_debug("create postlog2 %d for %d\n", postlog2->index, node->index);
675 return postlog2;
676 }
677
678 /* Once we schedule the successor, would the predecessor be fully ready? */
679 static bool pred_almost_ready(gpir_dep *dep)
680 {
681 bool fully_ready = true;
682 gpir_node_foreach_succ(dep->pred, other_dep) {
683 gpir_node *succ = other_dep->succ;
684 if (!succ->sched.instr && dep->succ != other_dep->succ) {
685 fully_ready = false;
686 break;
687 }
688 }
689
690 return fully_ready;
691 }
692
693 /* Recursively try to schedule a node and all its dependent nodes that can fit
694 * in the same instruction. There is a simple heuristic scoring system to try
695 * to group together nodes that load different components of the same input,
696 * to avoid bottlenecking for operations like matrix multiplies that are
697 * mostly input-bound.
698 */
699
700 static int _schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
701 {
702 if (!schedule_try_place_node(ctx, node, speculative))
703 return INT_MIN;
704
705 int score = 0;
706
707 gpir_node_foreach_pred(node, dep) {
708 if (!gpir_is_input_node(dep->pred))
709 continue;
710
711 int pred_score = INT_MIN;
712 if (pred_almost_ready(dep)) {
713 if (dep->pred->type == gpir_node_type_load ||
714 node->type == gpir_node_type_store) {
715 pred_score = _schedule_try_node(ctx, dep->pred, speculative);
716 }
717 }
718 if (dep->pred->type == gpir_node_type_load ||
719 node->type == gpir_node_type_store) {
720 if (pred_score == INT_MIN) {
721 if (node->op == gpir_op_mov) {
722 /* The only moves on the ready list are for loads that we
723 * couldn't schedule immediately, as created below. If we
724 * couldn't schedule the load, there's no point scheduling
725 * the move. The normal move threading logic will ensure
726 * that another move is created if we're about to go too far
727 * from the uses of this move.
728 */
729 assert(speculative);
730 return INT_MIN;
731 } else if (!speculative && dep->pred->type == gpir_node_type_load) {
732 /* We couldn't schedule the load right away, so it will have
733 * to happen in some earlier instruction and then be moved
734 * into a value register and threaded to the use by "node".
735 * We create the move right away, so that later we'll fail
736 * to schedule it if there isn't a slot for a move
737 * available.
738 */
739 create_move(ctx, dep->pred);
740 }
741 /* Penalize nodes whose dependent ops we couldn't schedule.
742 */
743 score--;
744 } else {
745 score += pred_score;
746 continue;
747 }
748 }
749 }
750
751 return score;
752 }
753
754 /* If we speculatively tried a node, undo everything.
755 */
756
757 static void schedule_undo_node(sched_ctx *ctx, gpir_node *node)
758 {
759 gpir_instr_remove_node(ctx->instr, node);
760
761 gpir_node_foreach_pred(node, dep) {
762 gpir_node *pred = dep->pred;
763 if (pred->sched.instr) {
764 schedule_undo_node(ctx, pred);
765 }
766 }
767 }
768
769 /* Try to schedule a node. We also try to schedule any predecessors that can
770 * be part of the same instruction. If "speculative" is true, then we don't
771 * actually change any state, only returning the score were the node to be
772 * scheduled, with INT_MIN meaning "cannot be scheduled at all".
773 */
774 static int schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
775 {
776 int prev_slots = ctx->ready_list_slots;
777
778 int score = _schedule_try_node(ctx, node, speculative);
779
780 if (ctx->ready_list_slots > GPIR_VALUE_REG_NUM) {
781 assert(speculative);
782 ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
783 ctx->ready_list_slots - GPIR_VALUE_REG_NUM);
784 score = INT_MIN;
785 }
786
787 if (speculative) {
788 ctx->ready_list_slots = prev_slots;
789 if (node->sched.instr)
790 schedule_undo_node(ctx, node);
791 }
792
793 return score;
794 }
795
796 /* This is called when we want to spill "node" by inserting loads at its uses.
797 * It returns all the possible registers we can use so that all the loads will
798  * successfully be inserted. It also returns the first instruction we'll
799  * need to insert a load for.
800 */
801
802 static uint64_t get_available_regs(sched_ctx *ctx, gpir_node *node,
803 int *min_index)
804 {
805 uint64_t available = ~0ull;
806 gpir_node_foreach_succ(node, dep) {
807 if (dep->type != GPIR_DEP_INPUT)
808 continue;
809
810 gpir_node *use = dep->succ;
811 gpir_instr *instr = use->sched.instr;
812
813 if (!instr) {
814 /* This use isn't scheduled, so no need to spill it. */
815 continue;
816 }
817
818 if (use->type == gpir_node_type_store) {
819 /* We're trying to spill something that was recently stored... just
820 * bail out.
821 */
822 return 0;
823 }
824
825 if (use->op == gpir_op_mov && instr == ctx->instr) {
826 /* We try to spill the sources of this move, so we can free up space
827 * in the current instruction.
828 *
829 * TODO: should we go back further? It might let us schedule the
830 * write earlier in some cases, but then we might fail to spill.
831 */
832 available &= get_available_regs(ctx, use, min_index);
833 } else {
834 if (instr->index < *min_index)
835 *min_index = instr->index;
836
837 uint64_t use_available = 0;
838
839 if (instr->reg0_use_count == 0)
840 use_available = ~0ull;
841 else if (!instr->reg0_is_attr)
842             use_available = 0xfull << (4 * instr->reg0_index);
843
844 if (instr->reg1_use_count == 0)
845 use_available = ~0ull;
846 else
847             use_available |= 0xfull << (4 * instr->reg1_index);
848
849 available &= use_available;
850 }
851 }
852
853 return available;
854 }
855
856 /* Using "min_index" returned by get_available_regs(), figure out which
857 * registers are killed by a write after or during the current instruction and
858  * hence can't be used for spilling. Writes that haven't been scheduled yet
859 * should be reflected in live_physregs.
860 */
861
862 static uint64_t get_killed_regs(sched_ctx *ctx, int min_index)
863 {
864 uint64_t killed = 0;
865
866 list_for_each_entry(gpir_instr, instr, &ctx->block->instr_list, list) {
867 if (instr->index <= min_index)
868 break;
869
870 for (int slot = GPIR_INSTR_SLOT_STORE0; slot <= GPIR_INSTR_SLOT_STORE3;
871 slot++) {
872 if (!instr->slots[slot])
873 continue;
874
875 gpir_store_node *store = gpir_node_to_store(instr->slots[slot]);
876 if (store->node.op != gpir_op_store_reg)
877 continue;
878
879 killed |= 1ull << (4 * store->index + store->component);
880 }
881 }
882
883 return killed;
884 }
885
886 /* Actually spill a node so that it is no longer in the ready list. Note that
887 * this must exactly follow the logic of get_available_regs() or else the
888 * loads could fail to schedule.
889 */
890
891 static void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store)
892 {
893 gpir_node_foreach_succ_safe(node, dep) {
894 if (dep->type != GPIR_DEP_INPUT)
895 continue;
896
897 gpir_node *use = dep->succ;
898 gpir_instr *instr = use->sched.instr;
899
900 if (!instr)
901 continue;
902
903 if (use->op == gpir_op_mov && instr == ctx->instr) {
904 spill_node(ctx, use, store);
905 } else {
906 gpir_load_node *load = gpir_node_create(ctx->block, gpir_op_load_reg);
907 load->index = store->index;
908 load->component = store->component;
909 list_add(&load->node.list, &ctx->block->node_list);
910 gpir_node_replace_child(dep->succ, dep->pred, &load->node);
911 gpir_node_replace_pred(dep, &load->node);
912 gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
913 gpir_debug("spilling use %d of node %d to load node %d\n",
914 use->index, node->index, load->node.index);
915 ASSERTED bool result = _try_place_node(ctx, use->sched.instr, &load->node);
916 assert(result);
917 }
918 }
919
920 if (node->op == gpir_op_mov) {
921 /* We replaced all the uses of the move, so it's dead now. */
922 gpir_instr_remove_node(node->sched.instr, node);
923 gpir_node_delete(node);
924 } else {
925 /* We deleted all the uses of the node except the store, so it's not
926 * live anymore.
927 */
928 list_del(&node->list);
929 node->sched.inserted = false;
930 ctx->ready_list_slots--;
931 if (node->sched.max_node) {
932 node->sched.max_node = false;
933 ctx->instr->alu_num_slot_needed_by_max--;
934 }
935 if (node->sched.next_max_node) {
936 node->sched.next_max_node = false;
937 ctx->instr->alu_num_unscheduled_next_max--;
938 }
939 }
940 }
941
942 static bool used_by_store(gpir_node *node, gpir_instr *instr)
943 {
944 gpir_node_foreach_succ(node, dep) {
945 if (dep->type != GPIR_DEP_INPUT)
946 continue;
947
948 if (dep->succ->type == gpir_node_type_store &&
949 dep->succ->sched.instr == instr)
950 return true;
951 }
952
953 return false;
954 }
955
956 static gpir_node *consuming_postlog2(gpir_node *node)
957 {
958 if (node->op != gpir_op_complex1)
959 return NULL;
960
961 gpir_node_foreach_succ(node, dep) {
962 if (dep->type != GPIR_DEP_INPUT)
963 continue;
964 if (dep->succ->op == gpir_op_postlog2)
965 return dep->succ;
966 else
967 return NULL;
968 }
969
970 return NULL;
971 }
972
973 static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
974 {
975 assert(node->op != gpir_op_mov);
976
977 if (used_by_store(node, ctx->instr))
978 return false;
979
980 gpir_debug("trying to spill %d\n", node->index);
981
982 int min_instr = INT_MAX;
983 uint64_t available = get_available_regs(ctx, node, &min_instr);
984 available &= ~get_killed_regs(ctx, min_instr);
985
986 if (node->sched.physreg_store) {
987 gpir_store_node *store = node->sched.physreg_store;
988 if (!(available & (1ull << (4 * store->index + store->component))))
989 return false;
990 } else {
991 available &= ~ctx->live_physregs;
992
993 if (available == 0)
994 return false;
995
996       /* Don't spill complex1 if it's used by a postlog2; instead, turn the
997        * postlog2 into a move, replace the complex1 with a new postlog2, and
998        * spill that. The store needs a move anyway, so the postlog2 is usually free.
999 */
1000 gpir_node *postlog2 = consuming_postlog2(node);
1001 if (postlog2) {
1002 postlog2->op = gpir_op_mov;
1003 node = create_postlog2(ctx, node);
1004 }
1005
1006 /* TODO: use a better heuristic for choosing an available register? */
1007 int physreg = ffsll(available) - 1;
1008
1009 ctx->live_physregs |= (1ull << physreg);
1010
1011 /* TODO: when we support multiple basic blocks, there may be register
1012 * loads/stores to this register other than this one that haven't been
1013 * scheduled yet so we may need to insert write-after-read dependencies.
1014 */
1015 gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg);
1016 store->index = physreg / 4;
1017 store->component = physreg % 4;
1018 store->child = node;
1019 store->node.sched.max_node = false;
1020 store->node.sched.next_max_node = false;
1021 store->node.sched.complex_allowed = false;
1022 store->node.sched.pos = -1;
1023 store->node.sched.instr = NULL;
1024 store->node.sched.inserted = false;
1025 store->node.sched.dist = node->sched.dist;
1026 if (node->op == gpir_op_complex1) {
1027 /* Complex1 cannot be directly stored, and has a latency of 2 */
1028 store->node.sched.dist += 2;
1029 }
1030 node->sched.physreg_store = store;
1031 gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);
1032 node->sched.ready = false;
1033 schedule_insert_ready_list(ctx, &store->node);
1034 }
1035
1036 gpir_debug("spilling %d to $%d.%c, store %d\n", node->index,
1037 node->sched.physreg_store->index,
1038 "xyzw"[node->sched.physreg_store->component],
1039 node->sched.physreg_store->node.index);
1040
1041 spill_node(ctx, node, node->sched.physreg_store);
1042
1043 return true;
1044 }
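
/*
 * For reference, a small illustrative sketch (not used by the scheduler) of
 * how live_physregs and the spill code above pack the 16 vec4 physical
 * registers into a single 64-bit mask, one bit per component, i.e.
 * physreg = 4 * index + component. For example, bit 13 corresponds to $3.y:
 *
 *    int physreg = 13;
 *    int index = physreg / 4;         // 3
 *    int component = physreg % 4;     // 1, i.e. "xyzw"[1] == 'y'
 *    uint64_t bit = 1ull << physreg;  // the bit set/tested in live_physregs
 */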
1045
1046 static bool try_spill_nodes(sched_ctx *ctx, gpir_node *orig_node)
1047 {
1048 /* First, try to spill max nodes. */
1049 list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
1050 if (ctx->max_node_spill_needed <= 0)
1051 break;
1052
1053 /* orig_node is the node we're trying to schedule, so spilling it makes
1054 * no sense. Also don't try to spill any nodes in front of it, since
1055 * they might be scheduled instead.
1056 */
1057 if (node == orig_node)
1058 break;
1059
1060 if (node->op == gpir_op_mov) {
1061 /* Don't try to spill loads, since that only adds another load and
1062 * store which is likely pointless.
1063 */
1064 continue;
1065 }
1066
1067 if (!gpir_is_input_node(node) || !node->sched.max_node)
1068 continue;
1069
1070 if (try_spill_node(ctx, node)) {
1071 ctx->max_node_spill_needed--;
1072 ctx->total_spill_needed--;
1073 }
1074 }
1075
1076 /* Now, try to spill the remaining nodes. */
1077 list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
1078 if (ctx->total_spill_needed <= 0)
1079 break;
1080
1081 if (node == orig_node)
1082 break;
1083
1084 if (node->op == gpir_op_mov)
1085 continue;
1086
1087 if (!gpir_is_input_node(node) ||
1088 !(node->sched.max_node || node->sched.next_max_node))
1089 continue;
1090
1091 if (try_spill_node(ctx, node))
1092 ctx->total_spill_needed--;
1093 }
1094
1095 return ctx->total_spill_needed <= 0 && ctx->max_node_spill_needed <= 0;
1096 }
1097
1098 static int gpir_get_curr_ready_list_slots(sched_ctx *ctx)
1099 {
1100 int total = 0;
1101 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1102 total += gpir_get_slots_required(node);
1103 }
1104
1105 return total;
1106 }
1107
1108 /* What gpir_get_min_end() would return if node were replaced with a move
1109 * instruction not in the complex slot. Normally this is 2 + min_end, except
1110 * for some store instructions which must have the move node in the same
1111 * instruction.
1112 */
1113 static int gpir_get_min_end_as_move(gpir_node *node)
1114 {
1115 int min = INT_MAX;
1116 gpir_node_foreach_succ(node, dep) {
1117 gpir_node *succ = dep->succ;
1118 if (succ->sched.instr && dep->type == GPIR_DEP_INPUT) {
1119 switch (succ->op) {
1120 case gpir_op_store_temp:
1121 case gpir_op_store_reg:
1122 case gpir_op_store_varying:
1123 continue;
1124 default:
1125 break;
1126 }
1127 if (min > succ->sched.instr->index + 2)
1128 min = succ->sched.instr->index + 2;
1129 }
1130 }
1131 return min;
1132 }
1133
1134 /* The second source for add0, add1, mul0, and mul1 units cannot be complex.
1135 * The hardware overwrites the add second sources with 0 and mul second
1136 * sources with 1. This can be a problem if we need to insert more next-max
1137 * moves but we only have values that can't use the complex unit for moves.
1138 *
1139 * Fortunately, we only need to insert a next-max move if there are more than
1140 * 5 next-max nodes, but there are only 4 sources in the previous instruction
1141 * that make values not complex-capable, which means there can be at most 4
1142 * non-complex-capable values. Hence there will always be at least two values
1143 * that can be rewritten to use a move in the complex slot. However, we have
1144 * to be careful not to waste those values by putting both of them in a
1145 * non-complex slot. This is handled for us by gpir_instr, which will reject
1146 * such instructions. We just need to tell it which nodes can use complex, and
1147 * it will do the accounting to figure out what is safe.
1148 */
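
/*
 * A quick sanity check of the counting argument above, with illustrative
 * numbers only:
 *
 *    int next_max_nodes = 6;       // a next-max move is only needed above 5
 *    int non_complex_capable = 4;  // at most 4 "second source" uses
 *    int complex_capable = next_max_nodes - non_complex_capable;
 *    assert(complex_capable >= 2); // something is always left for the complex slot
 */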
1149
1150 static bool can_use_complex(gpir_node *node)
1151 {
1152 gpir_node_foreach_succ(node, dep) {
1153 if (dep->type != GPIR_DEP_INPUT)
1154 continue;
1155
1156 gpir_node *succ = dep->succ;
1157 if (succ->type != gpir_node_type_alu)
1158 continue;
1159
1160 /* Note: this must be consistent with gpir_codegen_{mul,add}_slot{0,1}
1161 */
1162 gpir_alu_node *alu = gpir_node_to_alu(succ);
1163 switch (alu->node.op) {
1164 case gpir_op_complex1:
1165 /* complex1 puts its third source in the fourth slot */
1166 if (alu->children[1] == node || alu->children[2] == node)
1167 return false;
1168 break;
1169 case gpir_op_complex2:
1170 /* complex2 has its source duplicated, since it actually takes two
1171 * sources but we only ever use it with both sources the same. Hence
1172 * its source can never be the complex slot.
1173 */
1174 return false;
1175 case gpir_op_select:
1176 /* Select has its sources rearranged */
1177 if (alu->children[0] == node)
1178 return false;
1179 break;
1180 default:
1181 assert(alu->num_child <= 2);
1182 if (alu->num_child == 2 && alu->children[1] == node)
1183 return false;
1184 break;
1185 }
1186 }
1187
1188 return true;
1189 }
1190
1191 /* Initialize node->sched.max_node and node->sched.next_max_node for every
1192 * input node on the ready list. We should only need to do this once per
1193 * instruction, at the beginning, since we never add max nodes to the ready
1194 * list.
1195 */
1196
1197 static void sched_find_max_nodes(sched_ctx *ctx)
1198 {
1199 ctx->instr->alu_num_unscheduled_next_max = 0;
1200 ctx->instr->alu_num_slot_needed_by_max = 0;
1201
1202 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1203 if (!gpir_is_input_node(node))
1204 continue;
1205
1206 int min_end_move = gpir_get_min_end_as_move(node);
1207 node->sched.max_node = (min_end_move == ctx->instr->index);
1208 node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
1209 if (node->sched.next_max_node)
1210 node->sched.complex_allowed = can_use_complex(node);
1211
1212 if (node->sched.max_node)
1213 ctx->instr->alu_num_slot_needed_by_max++;
1214 if (node->sched.next_max_node)
1215 ctx->instr->alu_num_unscheduled_next_max++;
1216 }
1217 }
1218
1219 /* Verify the invariants described in gpir.h, as well as making sure the
1220 * counts are correct.
1221 */
1222 static void verify_max_nodes(sched_ctx *ctx)
1223 {
1224 int alu_num_slot_needed_by_max = 0;
1225 int alu_num_unscheduled_next_max = 0;
1226 int alu_num_slot_needed_by_store = 0;
1227 int alu_num_slot_needed_by_non_cplx_store = 0;
1228 int alu_max_allowed_next_max = 5;
1229
1230 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1231 if (!gpir_is_input_node(node))
1232 continue;
1233
1234 if (node->sched.max_node)
1235 alu_num_slot_needed_by_max++;
1236 if (node->sched.next_max_node)
1237 alu_num_unscheduled_next_max++;
1238 if (used_by_store(node, ctx->instr)) {
1239 alu_num_slot_needed_by_store++;
1240 if (node->sched.next_max_node && !node->sched.complex_allowed)
1241 alu_num_slot_needed_by_non_cplx_store++;
1242 }
1243 }
1244
1245 if (ctx->instr->slots[GPIR_INSTR_SLOT_MUL0] &&
1246 ctx->instr->slots[GPIR_INSTR_SLOT_MUL0]->op == gpir_op_complex1)
1247 alu_max_allowed_next_max = 4;
1248
1249 assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
1250 assert(ctx->instr->alu_num_unscheduled_next_max == alu_num_unscheduled_next_max);
1251 assert(ctx->instr->alu_max_allowed_next_max == alu_max_allowed_next_max);
1252 assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
1253 assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
1254 alu_num_slot_needed_by_non_cplx_store);
1255 assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + MAX2(alu_num_unscheduled_next_max - alu_max_allowed_next_max, 0));
1256 assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
1257 }
1258
1259 static bool try_node(sched_ctx *ctx)
1260 {
1261 gpir_node *best_node = NULL;
1262 int best_score = INT_MIN;
1263
1264 /* Spilling will delete arbitrary nodes after the current one in the ready
1265 * list, which means that we always need to look up the next node in the
1266 * list at the end of each iteration. While list_for_each_entry() works for
1267 * this purpose, its sanity checking assumes that you don't want to modify
1268 * the list at all. We know better here, so we have to open-code
1269 * list_for_each_entry() without the check in order to not assert.
1270 */
1271 for (gpir_node *node = LIST_ENTRY(gpir_node, ctx->ready_list.next, list);
1272 &node->list != &ctx->ready_list;
1273 node = LIST_ENTRY(gpir_node, node->list.next, list)) {
1274 if (best_score != INT_MIN) {
1275 if (node->sched.dist < best_node->sched.dist)
1276 break;
1277 }
1278
1279 if (node->sched.ready) {
1280 ctx->total_spill_needed = 0;
1281 ctx->max_node_spill_needed = 0;
1282 int score = schedule_try_node(ctx, node, true);
1283 if (score == INT_MIN && !best_node &&
1284 ctx->total_spill_needed > 0 &&
1285 try_spill_nodes(ctx, node)) {
1286 score = schedule_try_node(ctx, node, true);
1287 }
1288
1289 /* schedule_first nodes must be scheduled if possible */
1290 if (gpir_op_infos[node->op].schedule_first && score != INT_MIN) {
1291 best_node = node;
1292 best_score = score;
1293 break;
1294 }
1295
1296 if (score > best_score) {
1297 best_score = score;
1298 best_node = node;
1299 }
1300 }
1301 }
1302
1303 if (best_node) {
1304 gpir_debug("scheduling %d (score = %d)%s\n", best_node->index,
1305 best_score, best_node->sched.max_node ? " (max)" : "");
1306 ASSERTED int score = schedule_try_node(ctx, best_node, false);
1307 assert(score != INT_MIN);
1308 return true;
1309 }
1310
1311 return false;
1312 }
1313
1314 static void place_move(sched_ctx *ctx, gpir_node *node)
1315 {
1316 gpir_node *move = create_move(ctx, node);
1317 gpir_node_foreach_succ_safe(move, dep) {
1318 gpir_node *succ = dep->succ;
1319 if (!succ->sched.instr ||
1320 ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep)) {
1321 gpir_node_replace_pred(dep, node);
1322 if (dep->type == GPIR_DEP_INPUT)
1323 gpir_node_replace_child(succ, move, node);
1324 }
1325 }
1326 ASSERTED int score = schedule_try_node(ctx, move, false);
1327 assert(score != INT_MIN);
1328 }
1329
1330 /* For next-max nodes, not every node can be offloaded to a move in the
1331 * complex slot. If we run out of non-complex slots, then such nodes cannot
1332 * have moves placed for them. There should always be sufficient
1333 * complex-capable nodes so that this isn't a problem.
1334 */
1335 static bool can_place_move(sched_ctx *ctx, gpir_node *node)
1336 {
1337 if (!node->sched.next_max_node)
1338 return true;
1339
1340 if (node->sched.complex_allowed)
1341 return true;
1342
1343 return ctx->instr->alu_non_cplx_slot_free > 0;
1344 }
1345
1346 static bool sched_move(sched_ctx *ctx)
1347 {
1348 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1349 if (node->sched.max_node) {
1350 /* For complex1 that is consumed by a postlog2, we cannot allow any
1351 * moves in between. Convert the postlog2 to a move and insert a new
1352 * postlog2, and try to schedule it again in try_node().
1353 */
1354 gpir_node *postlog2 = consuming_postlog2(node);
1355 if (postlog2) {
1356 postlog2->op = gpir_op_mov;
1357 create_postlog2(ctx, node);
1358 } else {
1359 place_move(ctx, node);
1360 }
1361 return true;
1362 }
1363 }
1364
1365 if (ctx->instr->alu_num_slot_needed_by_store > 0) {
1366 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1367 if (used_by_store(node, ctx->instr)) {
1368 place_move(ctx, node);
1369 /* If we have a store of a load, then we need to make sure that we
1370 * immediately schedule the dependent load, or create a move
1371 * instruction for it, like we would with a normal instruction.
1372 * The rest of the code isn't set up to handle load nodes in the
1373 * ready list -- see the comments in _schedule_try_node().
1374 */
1375 if (node->type == gpir_node_type_load) {
1376 if (!schedule_try_place_node(ctx, node, false)) {
1377 create_move(ctx, node);
1378 }
1379 }
1380 return true;
1381 }
1382 }
1383 }
1384
1385    /* complex1 is a bit of a special case, since it has a latency of 2 cycles.
1386 * Once it is fully ready, we need to group all its uses in the same
1387 * instruction, and then we need to avoid creating any moves in the next
1388 * cycle in order to get it scheduled. Failing to do any of these things
1389 * could result in a cycle penalty, or even worse, an infinite loop of
1390 * inserting moves. If it is a next-max node and ready, then it has a use
1391 * in the previous cycle. If it has a use in the current cycle as well,
1392 * then we want to insert a move node to make it ready in two cycles -- if
1393 * we don't, then there will be at least a one cycle penalty. Otherwise, it
1394 * will be ready next cycle, and we shouldn't insert a move node, or else
1395 * we'll also have a one cycle penalty.
1396 */
1397 if (ctx->instr->alu_num_slot_free > 0) {
1398 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1399 if (!can_place_move(ctx, node))
1400 continue;
1401
1402 if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
1403 node->sched.ready) {
1404 bool skip = true;
1405 gpir_node_foreach_succ(node, dep) {
1406 if (dep->type != GPIR_DEP_INPUT)
1407 continue;
1408
1409 gpir_node *succ = dep->succ;
1410
1411 if (!succ->sched.instr ||
1412 succ->sched.instr->index != ctx->instr->index - 1) {
1413 skip = false;
1414 break;
1415 }
1416 }
1417
1418 if (skip)
1419 continue;
1420
1421 place_move(ctx, node);
1422 return true;
1423 }
1424 }
1425 }
1426
1427 /* Once we've made all the required moves, we're free to use any extra
1428 * slots to schedule more moves for next max nodes. Besides sometimes being
1429 * necessary, this can free up extra space in the next instruction. We walk
1430     * from back to front so that we first pick the nodes least likely to be
1431     * scheduled next -- an extra move would be unnecessary there. But make sure
1432     * not to re-handle the complex1 case already handled above.
1433 */
1434 if (ctx->instr->alu_num_slot_free > 0) {
1435 list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
1436 if (!can_place_move(ctx, node))
1437 continue;
1438
1439 if (node->sched.next_max_node &&
1440 !(node->op == gpir_op_complex1 && node->sched.ready)) {
1441 place_move(ctx, node);
1442 return true;
1443 }
1444 }
1445 }
1446
1447 /* We may have skipped complex1 above, but if we run out of space, we still
1448 * need to insert the move.
1449 */
1450
1451 if (ctx->instr->alu_num_unscheduled_next_max >
1452 ctx->instr->alu_max_allowed_next_max) {
1453 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1454 if (!can_place_move(ctx, node))
1455 continue;
1456
1457 if (node->sched.next_max_node) {
1458 place_move(ctx, node);
1459 return true;
1460 }
1461 }
1462 }
1463
1464
1465 return false;
1466 }
1467
1468 static bool gpir_sched_instr_pass(sched_ctx *ctx)
1469 {
1470 if (try_node(ctx))
1471 return true;
1472
1473 if (sched_move(ctx))
1474 return true;
1475
1476 return false;
1477 }
1478
1479 static void schedule_print_pre_one_instr(sched_ctx *ctx)
1480 {
1481 if (!(lima_debug & LIMA_DEBUG_GP))
1482 return;
1483
1484 printf("instr %d for ready list:", ctx->instr->index);
1485 list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
1486 printf(" %d/%c (%d, %d, %s)", node->index, node->sched.ready ? 'r' : 'p',
1487 node->sched.dist, gpir_get_slots_required(node),
1488 node->sched.max_node ? "max" : (node->sched.next_max_node ? "next" : "none"));
1489 }
1490 printf("\nlive physregs: ");
1491 for (unsigned i = 0; i < 16; i++) {
1492 if (ctx->live_physregs & (0xfull << (4 * i))) {
1493 printf("$%d.", i);
1494 for (unsigned j = 0; j < 4; j++) {
1495 if (ctx->live_physregs & (1ull << (4 * i + j)))
1496 printf("%c", "xyzw"[j]);
1497 }
1498 printf(" ");
1499 }
1500 }
1501 printf("\n");
1502 }
1503
1504 static void schedule_print_post_one_instr(gpir_instr *instr)
1505 {
1506 if (!(lima_debug & LIMA_DEBUG_GP))
1507 return;
1508
1509 printf("post schedule instr");
1510 for (int i = 0; i < GPIR_INSTR_SLOT_NUM; i++) {
1511 if (instr->slots[i])
1512 printf(" %d/%d", i, instr->slots[i]->index);
1513 }
1514 printf("\n");
1515 }
1516
1517
1518 static bool schedule_one_instr(sched_ctx *ctx)
1519 {
1520 gpir_instr *instr = gpir_instr_create(ctx->block);
1521 if (unlikely(!instr))
1522 return false;
1523
1524 ctx->instr = instr;
1525
1526 sched_find_max_nodes(ctx);
1527 schedule_print_pre_one_instr(ctx);
1528
1529 while (gpir_sched_instr_pass(ctx)) {
1530 assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx));
1531 #ifndef NDEBUG
1532 verify_max_nodes(ctx);
1533 verify_ready_list(ctx);
1534 #endif
1535 }
1536
1537 schedule_print_post_one_instr(instr);
1538 return true;
1539 }
1540
1541 static bool schedule_block(gpir_block *block)
1542 {
1543 /* calculate distance */
1544 list_for_each_entry(gpir_node, node, &block->node_list, list) {
1545 if (gpir_node_is_root(node))
1546 schedule_update_distance(node);
1547 }
1548
1549 sched_ctx ctx;
1550 list_inithead(&ctx.ready_list);
1551 ctx.block = block;
1552 ctx.ready_list_slots = 0;
1553 /* TODO initialize with block live out once we have proper liveness
1554 * tracking
1555 */
1556 ctx.live_physregs = 0;
1557
1558 /* construct the ready list from root nodes */
1559 list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
1560 if (gpir_node_is_root(node))
1561 schedule_insert_ready_list(&ctx, node);
1562 }
1563
1564 list_inithead(&block->node_list);
1565 while (!list_empty(&ctx.ready_list)) {
1566 if (!schedule_one_instr(&ctx))
1567 return false;
1568 }
1569
1570 return true;
1571 }
1572
1573 static void schedule_build_dependency(gpir_block *block)
1574 {
1575 gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0};
1576
1577    /* Merge dummy_f/dummy_m nodes back into the node they were created from. */
1578 list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
1579 if (node->op == gpir_op_dummy_m) {
1580 gpir_alu_node *alu = gpir_node_to_alu(node);
1581 gpir_node *origin = alu->children[0];
1582 gpir_node *dummy_f = alu->children[1];
1583
1584 gpir_node_foreach_succ(node, dep) {
1585 gpir_node *succ = dep->succ;
1586             /* origin and node may have the same succ (by VREG/INPUT or
1587 * VREG/VREG dep), so use gpir_node_add_dep() instead of
1588 * gpir_node_replace_pred() */
1589 gpir_node_add_dep(succ, origin, dep->type);
1590 gpir_node_replace_child(succ, node, origin);
1591 }
1592 gpir_node_delete(dummy_f);
1593 gpir_node_delete(node);
1594 }
1595 }
1596
1597 /* Forward dependencies. We only need to add these for register loads,
1598 * since value registers already have an input dependency.
1599 */
1600 list_for_each_entry(gpir_node, node, &block->node_list, list) {
1601 if (node->op == gpir_op_load_reg) {
1602 gpir_load_node *load = gpir_node_to_load(node);
1603 unsigned index = 4 * load->index + load->component;
1604 if (last_written[index]) {
1605 gpir_node_add_dep(node, last_written[index], GPIR_DEP_READ_AFTER_WRITE);
1606 }
1607 }
1608
1609 if (node->value_reg >= 0)
1610 last_written[node->value_reg] = node;
1611 }
1612
1613 memset(last_written, 0, sizeof(last_written));
1614
1615 /* False dependencies. For value registers, these exist only to make sure
1616 * that the maximum pressure isn't exceeded and are hence "fake".
1617 */
1618 list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
1619 if (node->op == gpir_op_load_reg) {
1620 gpir_load_node *load = gpir_node_to_load(node);
1621 unsigned index = 4 * load->index + load->component;
1622 if (last_written[index]) {
1623 gpir_node_add_dep(last_written[index], node, GPIR_DEP_WRITE_AFTER_READ);
1624 }
1625 } else {
1626 gpir_node_foreach_pred(node, dep) {
1627 if (dep->type == GPIR_DEP_INPUT) {
1628 int index = dep->pred->value_reg;
1629 if (index >= 0 && last_written[index]) {
1630 gpir_node_add_dep(last_written[index], node,
1631 GPIR_DEP_WRITE_AFTER_READ);
1632 }
1633 }
1634 }
1635 }
1636
1637 if (node->value_reg >= 0)
1638 last_written[node->value_reg] = node;
1639 }
1640 }
1641
1642 static void print_statistic(gpir_compiler *comp, int save_index)
1643 {
1644 int num_nodes[gpir_op_num] = {0};
1645 int num_created_nodes[gpir_op_num] = {0};
1646
1647 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1648 list_for_each_entry(gpir_node, node, &block->node_list, list) {
1649 num_nodes[node->op]++;
1650 if (node->index >= save_index)
1651 num_created_nodes[node->op]++;
1652 }
1653 }
1654
1655 printf("====== gpir scheduler statistic ======\n");
1656 printf("---- how many nodes are scheduled ----\n");
1657 int n = 0, l = 0;
1658 for (int i = 0; i < gpir_op_num; i++) {
1659 if (num_nodes[i]) {
1660 printf("%10s:%-6d", gpir_op_infos[i].name, num_nodes[i]);
1661 n += num_nodes[i];
1662 if (!(++l % 4))
1663 printf("\n");
1664 }
1665 }
1666 if (l % 4)
1667 printf("\n");
1668 printf("\ntotal: %d\n", n);
1669
1670 printf("---- how many nodes are created ----\n");
1671 n = l = 0;
1672 for (int i = 0; i < gpir_op_num; i++) {
1673 if (num_created_nodes[i]) {
1674 printf("%10s:%-6d", gpir_op_infos[i].name, num_created_nodes[i]);
1675 n += num_created_nodes[i];
1676 if (!(++l % 4))
1677 printf("\n");
1678 }
1679 }
1680 if (l % 4)
1681 printf("\n");
1682 printf("\ntotal: %d\n", n);
1683 printf("------------------------------------\n");
1684 }
1685
1686 bool gpir_schedule_prog(gpir_compiler *comp)
1687 {
1688 int save_index = comp->cur_index;
1689
1690 /* init schedule info */
1691 int index = 0;
1692 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1693 block->sched.instr_index = 0;
1694 list_for_each_entry(gpir_node, node, &block->node_list, list) {
1695 node->sched.instr = NULL;
1696 node->sched.pos = -1;
1697 node->sched.index = index++;
1698 node->sched.dist = -1;
1699 /* TODO when we support multiple basic blocks, we need a way to keep
1700 * track of this for physregs allocated before the scheduler.
1701 */
1702 node->sched.physreg_store = NULL;
1703 node->sched.ready = false;
1704 node->sched.inserted = false;
1705 node->sched.complex_allowed = false;
1706 node->sched.max_node = false;
1707 node->sched.next_max_node = false;
1708 }
1709 }
1710
1711 /* build dependency */
1712 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1713 schedule_build_dependency(block);
1714 }
1715
1716 //gpir_debug("after scheduler build reg dependency\n");
1717 //gpir_node_print_prog_dep(comp);
1718
1719 list_for_each_entry(gpir_block, block, &comp->block_list, list) {
1720 if (!schedule_block(block)) {
1721 gpir_error("fail schedule block\n");
1722 return false;
1723 }
1724 }
1725
1726 if (lima_debug & LIMA_DEBUG_GP) {
1727 print_statistic(comp, save_index);
1728 gpir_instr_print_prog(comp);
1729 }
1730
1731 return true;
1732 }