/*
 * Copyright (c) 2017 Lima Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <limits.h>

#include "gpir.h"

/*
 * GP scheduling algorithm (by Connor Abbott <cwabbott0@gmail.com>)
 *
 * The GP pipeline has three main stages:
 *
 * --------------------------------------------------------
 * |              Register/Attr/Temp Fetch                |
 * --------------------------------------------------------
 * |  Mul0  |  Mul1  |  Add0  |  Add1  |  Cplx  |  Pass   |
 * --------------------------------------------------------
 * | Complex1 |  Temp/Register/Varying   |      Pass      |
 * | Stage 2  |          Store           |    Stage 2     |
 * --------------------------------------------------------
 *
 * Because of this setup, storing a register has a latency of three cycles.
 * Also, the register file is organized into 4-component vectors, and the
 * load stage can only load two vectors at a time. Aside from these highly
 * constrained register load/store units, there is an explicit bypass
 * network, where each unit (mul0/mul1/etc.) can directly access the result
 * of any unit from the previous two cycles, except for the complex unit,
 * whose result can only be accessed for one cycle (since it's expected to
 * be used directly by the complex2 instruction in the following cycle).
 *
 * Because of the very restricted register file, and because only rarely are
 * all the units in use at the same time, it can be very beneficial to
 * "thread" a value from source to destination by inserting moves in the
 * otherwise-unused units, without involving the register file at all. It's
 * very difficult to fully exploit this with a traditional scheduler, so we
 * need to do something a little untraditional. The 512-instruction limit
 * means that for more complex shaders, we need to do as well as possible or
 * else the app won't even work.
 *
 * The scheduler works by considering the bypass network as a kind of
 * register file. It's a quite unusual register file, since registers have to
 * be assigned "on the fly" as we schedule operations, but with some care, we
 * can use something conceptually similar to a linear-scan allocator to
 * successfully schedule nodes to instructions without running into
 * conflicts.
 *
 * Values in the IR are separated into normal values, or "value registers",
 * which is what normal nodes like add, mul, etc. produce, and which only
 * live inside one basic block, and registers, which can span multiple basic
 * blocks but have to be accessed via special load_reg/store_reg nodes. RA
 * assigns physical registers to both value registers and normal registers,
 * treating load_reg/store_reg as a move instruction, but these are only used
 * directly for normal registers -- the physreg assigned to a value register
 * is "fake," and is only used inside the scheduler. Before scheduling we
 * insert write-after-read dependencies, even for value registers, as if
 * we were going to use those physregs, but then we throw them away. For
 * example, if we had something like:
 *
 *    (*)r2 = add (*)r1, (*)r2
 *    (*)r1 = load_reg r0
 *
 * we'd insert a write-after-read dependency between the add and the
 * load_reg, even though the starred registers aren't actually used by the
 * scheduler after this step. This step is crucial since it guarantees that
 * at any point in the schedule, the number of live registers plus live value
 * registers will never exceed the combined capacity of the register file and
 * the bypass network. This is because each live register/value register will
 * have a different fake register number, thanks to the fake dependencies
 * inserted before scheduling. This allows us to not have to worry about
 * spilling to temporaries, which is only ever done ahead of time.
 *
 * The scheduler is a bottom-up scheduler. It keeps track of each live value
 * register, and decides on-the-fly which value registers to keep in the
 * bypass network and which to "spill" to registers. Of particular importance
 * is the "ready list," which consists of "input nodes" (nodes that produce a
 * value that can be consumed via the bypass network), both "partially ready"
 * (only some of the uses have been scheduled) and "fully ready" (all uses
 * have been scheduled), as well as other non-input nodes like register
 * stores. Each input node on the ready list represents a live value register
 * before the current instruction. There must be at most 11 such input nodes
 * at all times, since there are only 11 slots in the next two instructions
 * which can reach the current instruction.
 *
 * An input node is a "max node" if it has a use two cycles ago, which must be
 * connected to a definition this cycle. Otherwise it may be a "next max node"
 * if it will be a max node in the next instruction (i.e. it has a use at most
 * one cycle ago), or it may be neither if all of its uses are in this cycle.
 * As we keep adding instructions to the front, input nodes graduate from
 * neither, to next max, to max, unless we decide to insert a move to keep the
 * value alive longer, at which point any uses after the current instruction
 * are rewritten to be uses of the move so that the original node returns to
 * neither. The scheduler decides which nodes to try freely, but we have to
 * reserve slots for two different reasons: (1) out of the 5 non-complex
 * slots, we reserve a slot for each max node, so that we can connect a
 * definition to the use 2 cycles ago. (2) Out of all 6 slots, we reserve a
 * slot for every next-max node above 5, so that for the next instruction
 * there are no more than 5 max nodes. When a max or next-max node gets
 * scheduled, the corresponding reservation is reduced by one. At the end, we
 * insert moves for every slot that is still reserved. The reservation is
 * actually managed by gpir_instr, and all we have to do is tell it how many
 * to reserve at the beginning and then tell it which nodes are max/next-max
 * nodes. When we start scheduling an instruction, there will be at most 5 max
 * nodes thanks to the previous instruction's next-max reservation/move
 * insertion. Since there are at most 11 total input nodes, if there are N max
 * nodes, there are at most 11 - N next-max nodes, and therefore at most
 * 11 - N - 5 = 6 - N slots need to be reserved for next-max nodes, and so at
 * most 6 - N + N = 6 slots need to be reserved in total, exactly the total
 * number of slots. So, thanks to the total input node restriction, we will
 * never need to reserve too many slots.
 *
 * It sometimes happens that scheduling a given node will violate this total
 * input node restriction, or that a reservation will mean that we can't
 * schedule it. We first schedule a node "speculatively" to see if this is a
 * problem. If some of the node's sources are loads, then we can schedule
 * the node and its dependent loads in one swoop to avoid going over the
 * pressure limit. If that fails, we can try to spill a ready or
 * partially-ready input node to a register by rewriting all of its uses to
 * refer to a register load. This removes it from the list of ready and
 * partially ready input nodes, as all of its uses are now unscheduled. If
 * successful, we can then proceed with scheduling the original node. All of
 * this happens "speculatively," meaning that afterwards the node is removed
 * and the entire state of the scheduler is reverted to before it was tried,
 * to ensure that we never get into an invalid state and run out of spots for
 * moves. In try_node(), we speculatively try to schedule each node on the
 * ready list, keeping only the nodes that could be successfully scheduled, so
 * that when we finally decide which node to actually schedule, we know it
 * will succeed. This is how we decide on the fly which values go in
 * registers and which go in the bypass network. Note that "unspilling" a
 * value is simply a matter of scheduling the store_reg instruction created
 * when the value was spilled.
 *
 * The careful accounting of live value registers, reservations for moves, and
 * speculative scheduling guarantee that we never run into a failure case
 * while scheduling. However, we need to make sure that this scheduler will
 * not get stuck in an infinite loop, i.e. that we'll always make forward
 * progress by eventually scheduling a non-move node. If we run out of value
 * registers, then we may have to spill a node to a register. Say that
 * scheduling one of the fully-ready nodes would leave us with 11 + N live
 * value registers before the current instruction. Since there are at most
 * 64 + 11 live physical registers and value registers in total, thanks to
 * the fake dependencies we inserted before scheduling, there are at most
 * 64 - N live physical registers, and therefore at least N registers are
 * available for spilling. Not all of these registers will be available
 * immediately, since in order to spill a node to a given register we have to
 * ensure that there are slots available to rewrite every use to a load
 * instruction, and that may not be the case. There may also be intervening
 * writes which prevent some registers from being used. However, these are
 * all temporary problems, since as we create each instruction, we create
 * additional register load slots that can be freely used for spilling, and
 * we create more move nodes, which means that the uses of the nodes we're
 * trying to spill keep moving forward. This means that eventually these
 * problems will go away, at which point we'll be able to spill a node
 * successfully, and therefore we'll eventually be able to schedule the first
 * node on the ready list.
 */

typedef struct {
   /* This is the list of ready and partially-ready nodes. A partially-ready
    * node must have at least one input dependency already scheduled.
    */
   struct list_head ready_list;

   /* The number of ready or partially-ready nodes with at least one input
    * dependency already scheduled. In other words, the number of live value
    * registers. This must be at most 11.
    */
   int ready_list_slots;

   /* The physical registers live into the current instruction. */
   uint64_t live_physregs;

   /* The current instruction. */
   gpir_instr *instr;

   /* The current basic block. */
   gpir_block *block;

   /* True if at least one node failed to schedule due to lack of available
    * slots.
    */

   /* The number of max nodes needed to spill to successfully schedule the
    * instruction.
    */
   int max_node_spill_needed;

   /* The number of max and next-max nodes needed to spill to successfully
    * schedule the instruction.
    */
   int total_spill_needed;

   /* For each physical register, a linked list of loads associated with it in
    * this block. When we spill a value to a given register, and there are
    * existing loads associated with it that haven't been scheduled yet, we
    * have to make sure that the corresponding unspill happens after the last
    * original use has happened, i.e. is scheduled before.
    */
   struct list_head physreg_reads[GPIR_PHYSICAL_REG_NUM];
} sched_ctx;
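
/* Minimum number of instructions between a dependency's producer and an ALU
 * consumer that reads it through the bypass network: loads can feed the ALU
 * in the same cycle, complex1 needs two cycles, everything else needs one.
 */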
static int gpir_min_dist_alu(gpir_dep *dep)
{
   switch (dep->pred->op) {
   case gpir_op_load_uniform:
   case gpir_op_load_temp:
   case gpir_op_load_reg:
   case gpir_op_load_attribute:
      return 0;

   case gpir_op_complex1:
      return 2;

   default:
      return 1;
   }
}

static int gpir_get_min_dist(gpir_dep *dep)
{
   switch (dep->type) {
   case GPIR_DEP_INPUT:
      switch (dep->succ->op) {
      case gpir_op_store_temp:
      case gpir_op_store_reg:
      case gpir_op_store_varying:
         /* Stores must use an alu node as input. Also, complex1 takes two
          * cycles, which means that its result cannot be stored to a register
          * as part of the normal path, and therefore it must also have a move
          * inserted.
          */
         if (dep->pred->type == gpir_node_type_load ||
             dep->pred->op == gpir_op_complex1)
            return INT_MAX >> 2;
         else
            return 0;

      default:
         return gpir_min_dist_alu(dep);
      }

   case GPIR_DEP_OFFSET:
      assert(dep->succ->op == gpir_op_store_temp);
      return gpir_min_dist_alu(dep);

   case GPIR_DEP_READ_AFTER_WRITE:
      if (dep->succ->op == gpir_op_load_temp &&
          dep->pred->op == gpir_op_store_temp) {
         return 4;
      } else if (dep->succ->op == gpir_op_load_reg &&
                 dep->pred->op == gpir_op_store_reg) {
         return 3;
      } else if ((dep->pred->op == gpir_op_store_temp_load_off0 ||
                  dep->pred->op == gpir_op_store_temp_load_off1 ||
                  dep->pred->op == gpir_op_store_temp_load_off2) &&
                 dep->succ->op == gpir_op_load_uniform) {
         return 4;
      } else {
         /* Fake dependency */
         return 0;
      }

   case GPIR_DEP_WRITE_AFTER_READ:
      return 0;
   }

   return 0;
}
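
/* Maximum number of instructions across which a consumer can still reach a
 * value through the bypass network: two cycles for most ALU results, only
 * one cycle for a result produced in the complex slot, and same-instruction
 * only for most loads.
 */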
static int gpir_max_dist_alu(gpir_dep *dep)
{
   switch (dep->pred->op) {
   case gpir_op_load_uniform:
   case gpir_op_load_temp:
      return 0;
   case gpir_op_load_attribute:
      return 1;
   case gpir_op_load_reg:
      if (dep->pred->sched.pos < GPIR_INSTR_SLOT_REG0_LOAD0 ||
          dep->pred->sched.pos > GPIR_INSTR_SLOT_REG0_LOAD3)
         return 0;
      else
         return 1;
   case gpir_op_exp2_impl:
   case gpir_op_log2_impl:
   case gpir_op_rcp_impl:
   case gpir_op_rsqrt_impl:
   case gpir_op_store_temp_load_off0:
   case gpir_op_store_temp_load_off1:
   case gpir_op_store_temp_load_off2:
      return 1;
   default:
      break;
   }

   if (dep->pred->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
      return 1;

   return 2;
}

static int gpir_get_max_dist(gpir_dep *dep)
{
   switch (dep->type) {
   case GPIR_DEP_INPUT:
      switch (dep->succ->op) {
      case gpir_op_store_temp:
      case gpir_op_store_reg:
      case gpir_op_store_varying:
         return 0;

      default:
         return gpir_max_dist_alu(dep);
      }

   case GPIR_DEP_OFFSET:
      assert(dep->succ->op == gpir_op_store_temp);
      return gpir_max_dist_alu(dep);

   default:
      break;
   }

   return INT_MAX >> 2; /* Don't want to overflow... */
}
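
/* Recursively compute node->sched.dist: the length of the longest path, in
 * minimum-distance terms, from this node down through its predecessors to a
 * leaf. The ready list is ordered by this value.
 */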
static void schedule_update_distance(gpir_node *node)
{
   if (gpir_node_is_leaf(node)) {
      node->sched.dist = 0;
      return;
   }

   gpir_node_foreach_pred(node, dep) {
      gpir_node *pred = dep->pred;

      if (pred->sched.dist < 0)
         schedule_update_distance(pred);

      int dist = pred->sched.dist + gpir_min_dist_alu(dep);
      if (node->sched.dist < dist)
         node->sched.dist = dist;
   }
}
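
/* An input node produces a value that at least one successor consumes
 * through the bypass network (a GPIR_DEP_INPUT edge), so it occupies a value
 * register while it is live.
 */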
static bool gpir_is_input_node(gpir_node *node)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type == GPIR_DEP_INPUT)
         return true;
   }
   return false;
}

/* Get the number of slots required for a node on the ready list.
 */
static int gpir_get_slots_required(gpir_node *node)
{
   if (!gpir_is_input_node(node))
      return 0;

   /* Note that we assume every node only consumes one slot, even dual-slot
    * instructions. While dual-slot instructions may consume more than one
    * slot, we can always safely insert a move if it turns out that there
    * isn't enough space for them. There's the risk that we get stuck in an
    * infinite loop if all the fully ready nodes are dual-slot nodes, but we
    * rely on spilling to registers to save us here.
    */
   return 1;
}
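
/* Sanity-check the ready list: fully-ready nodes must have every successor
 * scheduled, and partially-ready input nodes must still have at least one
 * unscheduled successor.
 */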
static void verify_ready_list(sched_ctx *ctx)
{
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (!gpir_is_input_node(node)) {
         assert(node->sched.ready);
         continue;
      }

      if (node->sched.ready) {
         /* Every successor must have been scheduled */
         gpir_node_foreach_succ(node, dep) {
            assert(dep->succ->sched.instr);
         }
      } else {
         /* There must be at least one successor that's not scheduled. */
         bool unscheduled = false;
         gpir_node_foreach_succ(node, dep) {
            unscheduled |= !(dep->succ->sched.instr);
         }
         assert(unscheduled);
      }
   }
}

static void schedule_insert_ready_list(sched_ctx *ctx,
                                       gpir_node *insert_node)
{
   /* A node is either fully ready or partially ready:
    *   fully ready: all successors have been scheduled
    *   partially ready: some of the input successors have been scheduled
    *
    * Both fully-ready and partially-ready nodes need to be inserted into
    * the ready list, but we only schedule a move node for partially-ready
    * nodes.
    */
   bool ready = true, insert = false;
   gpir_node_foreach_succ(insert_node, dep) {
      gpir_node *succ = dep->succ;
      if (succ->sched.instr) {
         if (dep->type == GPIR_DEP_INPUT)
            insert = true;
      } else
         ready = false;
   }

   insert_node->sched.ready = ready;
   /* for root node */
   insert |= ready;

   if (!insert || insert_node->sched.inserted)
      return;

   struct list_head *insert_pos = &ctx->ready_list;
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if ((insert_node->sched.dist > node->sched.dist ||
           gpir_op_infos[insert_node->op].schedule_first) &&
          !gpir_op_infos[node->op].schedule_first) {
         insert_pos = &node->list;
         break;
      }
   }

   list_addtail(&insert_node->list, insert_pos);
   insert_node->sched.inserted = true;
   ctx->ready_list_slots += gpir_get_slots_required(insert_node);
}
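
/* gpir_get_max_start()/gpir_get_min_end() bound the legal placement window
 * for a node: the earliest instruction index allowed by the minimum
 * distances to its scheduled successors, and the latest index allowed by
 * the maximum distances (i.e. how long the bypass network keeps the value
 * reachable).
 */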
static int gpir_get_max_start(gpir_node *node)
{
   int max_start = 0;

   /* find the max start instr constrained by all successors */
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (!succ->sched.instr)
         continue;

      int start = succ->sched.instr->index + gpir_get_min_dist(dep);
      if (start > max_start)
         max_start = start;
   }

   return max_start;
}

static int gpir_get_min_end(gpir_node *node)
{
   int min_end = INT_MAX;

   /* find the min end instr constrained by all successors */
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (!succ->sched.instr)
         continue;

      int end = succ->sched.instr->index + gpir_get_max_dist(dep);
      if (end < min_end)
         min_end = end;
   }

   return min_end;
}
static gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node)
{
   gpir_load_node *load = gpir_node_to_load(node);

   for (int i = GPIR_INSTR_SLOT_REG0_LOAD0; i <= GPIR_INSTR_SLOT_MEM_LOAD3; i++) {
      if (!instr->slots[i])
         continue;

      gpir_load_node *iload = gpir_node_to_load(instr->slots[i]);
      if (load->node.op == iload->node.op &&
          load->index == iload->index &&
          load->component == iload->component)
         return &iload->node;
   }
   return NULL;
}

/* Simply place the node into the given instruction without trying to deal
 * with liveness or the ready list. This will only fail if the node cannot
 * be placed due to a lack of available slots. In addition to normal node
 * placement, this is also used for placing loads when spilling to
 * registers.
 */
static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node)
{
   if (node->type == gpir_node_type_load) {
      gpir_node *load = gpir_sched_instr_has_load(instr, node);
      if (load) {
         /* This node may have a store as a successor, in which case we have
          * to fail it exactly like below in order to later create a move
          * node in between.
          */
         if (instr->index < gpir_get_max_start(node))
            return false;

         gpir_debug("same load %d in instr %d for node %d\n",
                    load->index, instr->index, node->index);

         /* We don't really merge the two nodes; we just pretend this one is
          * scheduled in the same place as the existing load.
          */
         node->sched.instr = load->sched.instr;
         node->sched.pos = load->sched.pos;
         return true;
      }
   }

   if (node->op == gpir_op_store_reg) {
      /* This register may be loaded in the next basic block, in which case
       * there still needs to be a 2-instruction gap. We do what the blob
       * seems to do and simply disable stores in the last two instructions
       * of the basic block.
       *
       * TODO: We may be able to do better than this, but we have to check
       * first if storing a register works across branches.
       */
      if (instr->index < 2)
         return false;
   }

   node->sched.instr = instr;

   int max_node_spill_needed = INT_MAX;
   int total_spill_needed = INT_MAX;
   int *slots = gpir_op_infos[node->op].slots;
   for (int i = 0; slots[i] != GPIR_INSTR_SLOT_END; i++) {
      node->sched.pos = slots[i];
      if (instr->index >= gpir_get_max_start(node) &&
          instr->index <= gpir_get_min_end(node) &&
          gpir_instr_try_insert_node(instr, node))
         return true;
      if (ctx->instr->non_cplx_slot_difference ||
          ctx->instr->slot_difference) {
         /* If one of these fields is non-zero, then we could insert the node
          * here after spilling. To get an accurate count of how many nodes we
          * need to spill, we need to choose one of the positions where there
          * were nonzero slot differences, preferably one with the smallest
          * difference (so we don't have to spill as much).
          */
         if (ctx->instr->non_cplx_slot_difference < max_node_spill_needed ||
             ctx->instr->slot_difference < total_spill_needed) {
            max_node_spill_needed = ctx->instr->non_cplx_slot_difference;
            total_spill_needed = ctx->instr->slot_difference;
         }
      }
   }

   if (max_node_spill_needed != INT_MAX) {
      /* Indicate how many spill nodes are needed. */
      ctx->max_node_spill_needed = MAX2(ctx->max_node_spill_needed,
                                        max_node_spill_needed);
      ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
                                     total_spill_needed);
   }
   node->sched.instr = NULL;
   node->sched.pos = -1;
   return false;
}

/* Try to place just the node given, updating the ready list. If "speculative"
 * is true, then this is part of the pre-commit phase. If false, then we have
 * committed to placing this node, so update liveness and ready list
 * information.
 */
static bool schedule_try_place_node(sched_ctx *ctx, gpir_node *node,
                                    bool speculative)
{
   if (!_try_place_node(ctx, ctx->instr, node)) {
      if (!speculative)
         gpir_debug("failed to place %d\n", node->index);
      return false;
   }

   ctx->ready_list_slots -= gpir_get_slots_required(node);

   if (!speculative) {
      gpir_debug("placed node %d\n", node->index);

      /* We assume here that writes are placed before reads. If this changes,
       * then this needs to be updated.
       */
      if (node->op == gpir_op_store_reg) {
         gpir_store_node *store = gpir_node_to_store(node);
         ctx->live_physregs &=
            ~(1ull << (4 * store->index + store->component));
         if (store->child->sched.physreg_store == store)
            store->child->sched.physreg_store = NULL;
      }

      if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         ctx->live_physregs |=
            (1ull << (4 * load->index + load->component));
      }

      list_del(&node->list);
      list_add(&node->list, &ctx->block->node_list);
      gpir_node_foreach_pred(node, dep) {
         gpir_node *pred = dep->pred;
         schedule_insert_ready_list(ctx, pred);
      }
   } else {
      gpir_node_foreach_pred(node, dep) {
         gpir_node *pred = dep->pred;
         if (!pred->sched.inserted && dep->type == GPIR_DEP_INPUT)
            ctx->ready_list_slots += gpir_get_slots_required(pred);
      }
   }

   return true;
}

/* Create a new node with "node" as the child, replace all uses of "node" with
 * this new node, and replace "node" with it in the ready list.
 */
static gpir_node *create_replacement(sched_ctx *ctx, gpir_node *node,
                                     gpir_op op)
{
   gpir_alu_node *new_node = gpir_node_create(node->block, op);
   if (unlikely(!new_node))
      return NULL;

   new_node->children[0] = node;
   new_node->num_child = 1;

   new_node->node.sched.instr = NULL;
   new_node->node.sched.pos = -1;
   new_node->node.sched.dist = node->sched.dist;
   new_node->node.sched.max_node = node->sched.max_node;
   new_node->node.sched.next_max_node = node->sched.next_max_node;
   new_node->node.sched.complex_allowed = node->sched.complex_allowed;

   ctx->ready_list_slots--;
   list_del(&node->list);
   node->sched.max_node = false;
   node->sched.next_max_node = false;
   node->sched.ready = false;
   node->sched.inserted = false;
   gpir_node_replace_succ(&new_node->node, node);
   gpir_node_add_dep(&new_node->node, node, GPIR_DEP_INPUT);
   schedule_insert_ready_list(ctx, &new_node->node);
   return &new_node->node;
}

static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
{
   gpir_node *move = create_replacement(ctx, node, gpir_op_mov);
   gpir_debug("create move %d for %d\n", move->index, node->index);
   return move;
}

static gpir_node *create_postlog2(sched_ctx *ctx, gpir_node *node)
{
   assert(node->op == gpir_op_complex1);
   gpir_node *postlog2 = create_replacement(ctx, node, gpir_op_postlog2);
   gpir_debug("create postlog2 %d for %d\n", postlog2->index, node->index);
   return postlog2;
}

/* Once we schedule the successor, would the predecessor be fully ready? */
static bool pred_almost_ready(gpir_dep *dep)
{
   bool fully_ready = true;
   gpir_node_foreach_succ(dep->pred, other_dep) {
      gpir_node *succ = other_dep->succ;
      if (!succ->sched.instr && dep->succ != other_dep->succ) {
         fully_ready = false;
         break;
      }
   }

   return fully_ready;
}

/* Recursively try to schedule a node and all its dependent nodes that can fit
 * in the same instruction. There is a simple heuristic scoring system to try
 * to group together nodes that load different components of the same input,
 * to avoid bottlenecking for operations like matrix multiplies that are
 * mostly input-bound.
 */
static int _schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
{
   if (!schedule_try_place_node(ctx, node, speculative))
      return INT_MIN;

   int score = 0;

   gpir_node_foreach_pred(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      int pred_score = INT_MIN;
      if (pred_almost_ready(dep)) {
         if (dep->pred->type == gpir_node_type_load ||
             node->type == gpir_node_type_store) {
            pred_score = _schedule_try_node(ctx, dep->pred, speculative);
         }
      }
      if (dep->pred->type == gpir_node_type_load ||
          node->type == gpir_node_type_store) {
         if (pred_score == INT_MIN) {
            if (node->op == gpir_op_mov) {
               /* The only moves on the ready list are for loads that we
                * couldn't schedule immediately, as created below. If we
                * couldn't schedule the load, there's no point scheduling
                * the move. The normal move threading logic will ensure
                * that another move is created if we're about to go too far
                * from the uses of this move.
                */
               return INT_MIN;
            } else if (!speculative && dep->pred->type == gpir_node_type_load) {
               /* We couldn't schedule the load right away, so it will have
                * to happen in some earlier instruction and then be moved
                * into a value register and threaded to the use by "node".
                * We create the move right away, so that later we'll fail
                * to schedule it if there isn't a slot for a move
                * available.
                */
               create_move(ctx, dep->pred);
            }
            /* Penalize nodes whose dependent ops we couldn't schedule.
             */
            score--;
         } else {
            score += pred_score;
         }
      }
   }

   return score;
}

/* If we speculatively tried a node, undo everything.
 */
static void schedule_undo_node(sched_ctx *ctx, gpir_node *node)
{
   gpir_instr_remove_node(ctx->instr, node);

   gpir_node_foreach_pred(node, dep) {
      gpir_node *pred = dep->pred;
      if (pred->sched.instr) {
         schedule_undo_node(ctx, pred);
      }
   }
}

/* Try to schedule a node. We also try to schedule any predecessors that can
 * be part of the same instruction. If "speculative" is true, then we don't
 * actually change any state, only returning the score were the node to be
 * scheduled, with INT_MIN meaning "cannot be scheduled at all".
 */
static int schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative)
{
   int prev_slots = ctx->ready_list_slots;

   int score = _schedule_try_node(ctx, node, speculative);

   if (ctx->ready_list_slots > GPIR_VALUE_REG_NUM) {
      assert(speculative);
      ctx->total_spill_needed = MAX2(ctx->total_spill_needed,
                                     ctx->ready_list_slots - GPIR_VALUE_REG_NUM);
      score = INT_MIN;
   }

   if (speculative) {
      ctx->ready_list_slots = prev_slots;
      if (node->sched.instr)
         schedule_undo_node(ctx, node);
   }

   return score;
}

/* This is called when we want to spill "node" by inserting loads at its uses.
 * It returns all the possible registers we can use so that all the loads will
 * successfully be inserted. Also return the first instruction we'll need to
 * insert a load for.
 */
static uint64_t get_available_regs(sched_ctx *ctx, gpir_node *node,
                                   int *min_index)
{
   uint64_t available = ~0ull;
   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      gpir_node *use = dep->succ;
      gpir_instr *instr = use->sched.instr;

      if (!instr) {
         /* This use isn't scheduled, so no need to spill it. */
         continue;
      }

      if (use->type == gpir_node_type_store) {
         /* We're trying to spill something that was recently stored... just
          * bail out.
          */
         return 0;
      }

      if (use->op == gpir_op_mov && instr == ctx->instr) {
         /* We try to spill the sources of this move, so we can free up space
          * in the current instruction.
          *
          * TODO: should we go back further? It might let us schedule the
          * write earlier in some cases, but then we might fail to spill.
          */
         available &= get_available_regs(ctx, use, min_index);
      } else {
         if (instr->index < *min_index)
            *min_index = instr->index;

         uint64_t use_available = 0;

         if (instr->reg0_use_count == 0)
            use_available = ~0ull;
         else if (!instr->reg0_is_attr)
            use_available = 0xfull << (4 * instr->reg0_index);

         if (instr->reg1_use_count == 0)
            use_available = ~0ull;
         else
            use_available |= 0xfull << (4 * instr->reg1_index);

         available &= use_available;
      }
   }

   return available;
}

/* Using "min_index" returned by get_available_regs(), figure out which
 * registers are killed by a write after or during the current instruction and
 * hence we can't use for spilling. Writes that haven't been scheduled yet
 * should be reflected in live_physregs.
 */
static uint64_t get_killed_regs(sched_ctx *ctx, int min_index)
{
   uint64_t killed = 0;

   list_for_each_entry(gpir_instr, instr, &ctx->block->instr_list, list) {
      if (instr->index <= min_index)
         continue;

      for (int slot = GPIR_INSTR_SLOT_STORE0; slot <= GPIR_INSTR_SLOT_STORE3;
           slot++) {
         if (!instr->slots[slot])
            continue;

         gpir_store_node *store = gpir_node_to_store(instr->slots[slot]);
         if (store->node.op != gpir_op_store_reg)
            continue;

         killed |= 1ull << (4 * store->index + store->component);
      }
   }

   return killed;
}

/* Actually spill a node so that it is no longer in the ready list. Note that
 * this must exactly follow the logic of get_available_regs() or else the
 * loads could fail to schedule.
 */
static void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store)
{
   gpir_node_foreach_succ_safe(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      gpir_node *use = dep->succ;
      gpir_instr *instr = use->sched.instr;

      if (!instr)
         continue;

      if (use->op == gpir_op_mov && instr == ctx->instr) {
         spill_node(ctx, use, store);
      } else {
         gpir_load_node *load = gpir_node_create(ctx->block, gpir_op_load_reg);
         load->index = store->index;
         load->component = store->component;
         list_add(&load->node.list, &ctx->block->node_list);
         gpir_node_replace_child(dep->succ, dep->pred, &load->node);
         gpir_node_replace_pred(dep, &load->node);
         gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
         gpir_debug("spilling use %d of node %d to load node %d\n",
                    use->index, node->index, load->node.index);
         ASSERTED bool result = _try_place_node(ctx, use->sched.instr, &load->node);
         assert(result);
      }
   }

   if (node->op == gpir_op_mov) {
      /* We replaced all the uses of the move, so it's dead now. */
      gpir_instr_remove_node(node->sched.instr, node);
      gpir_node_delete(node);
   } else {
      /* We deleted all the uses of the node except the store, so it's not
       * ready anymore.
       */
      list_del(&node->list);
      node->sched.inserted = false;
      ctx->ready_list_slots--;
      if (node->sched.max_node) {
         node->sched.max_node = false;
         ctx->instr->alu_num_slot_needed_by_max--;
      }
      if (node->sched.next_max_node) {
         node->sched.next_max_node = false;
         ctx->instr->alu_num_unscheduled_next_max--;
      }
   }
}
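
/* Check whether any store scheduled in "instr" consumes "node" directly. */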
static bool used_by_store(gpir_node *node, gpir_instr *instr)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      if (dep->succ->type == gpir_node_type_store &&
          dep->succ->sched.instr == instr)
         return true;
   }

   return false;
}
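
/* If "node" is a complex1 whose result is consumed by a postlog2 node,
 * return that postlog2 node; otherwise return NULL.
 */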
static gpir_node *consuming_postlog2(gpir_node *node)
{
   if (node->op != gpir_op_complex1)
      return NULL;

   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;
      if (dep->succ->op == gpir_op_postlog2)
         return dep->succ;
      else
         return NULL;
   }

   return NULL;
}
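
/* Spill "node" out of the bypass network: pick a free physical register,
 * create a store_reg fed by the node (or reuse an existing one), and rewrite
 * all of its scheduled uses to load from that register instead.
 */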
static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
{
   assert(node->op != gpir_op_mov);

   if (used_by_store(node, ctx->instr))
      return false;

   gpir_debug("trying to spill %d\n", node->index);

   int min_instr = INT_MAX;
   uint64_t available = get_available_regs(ctx, node, &min_instr);
   available &= ~get_killed_regs(ctx, min_instr);

   if (node->sched.physreg_store) {
      gpir_store_node *store = node->sched.physreg_store;
      if (!(available & (1ull << (4 * store->index + store->component))))
         return false;
   } else {
      available &= ~ctx->live_physregs;

      if (!available)
         return false;

      /* Don't spill complex1 if it's consumed by a postlog2; instead turn
       * the postlog2 into a move, replace the complex1 with a new postlog2,
       * and spill that. The store needs a move anyways so the postlog2 is
       * usually free.
       */
      gpir_node *postlog2 = consuming_postlog2(node);
      if (postlog2) {
         postlog2->op = gpir_op_mov;
         node = create_postlog2(ctx, node);
      }

      /* TODO: use a better heuristic for choosing an available register? */
      int physreg = ffsll(available) - 1;

      ctx->live_physregs |= (1ull << physreg);

      gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg);
      store->index = physreg / 4;
      store->component = physreg % 4;
      store->child = node;
      store->node.sched.max_node = false;
      store->node.sched.next_max_node = false;
      store->node.sched.complex_allowed = false;
      store->node.sched.pos = -1;
      store->node.sched.instr = NULL;
      store->node.sched.inserted = false;
      store->node.sched.dist = node->sched.dist;
      if (node->op == gpir_op_complex1) {
         /* Complex1 cannot be directly stored, and has a latency of 2 */
         store->node.sched.dist += 2;
      }
      node->sched.physreg_store = store;
      gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT);

      list_for_each_entry(gpir_load_node, load,
                          &ctx->physreg_reads[physreg], reg_link) {
         gpir_node_add_dep(&store->node, &load->node, GPIR_DEP_WRITE_AFTER_READ);
         if (load->node.sched.ready) {
            list_del(&load->node.list);
            load->node.sched.ready = false;
         }
      }

      node->sched.ready = false;
      schedule_insert_ready_list(ctx, &store->node);
   }

   gpir_debug("spilling %d to $%d.%c, store %d\n", node->index,
              node->sched.physreg_store->index,
              "xyzw"[node->sched.physreg_store->component],
              node->sched.physreg_store->node.index);

   spill_node(ctx, node, node->sched.physreg_store);

   return true;
}
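
/* Called when scheduling "orig_node" speculatively failed because of max /
 * next-max slot pressure: walk the ready list from the back and spill max
 * and next-max nodes until the recorded spill requirements are met. Returns
 * whether they were.
 */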
static bool try_spill_nodes(sched_ctx *ctx, gpir_node *orig_node)
{
   /* First, try to spill max nodes. */
   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
      if (ctx->max_node_spill_needed <= 0)
         break;

      /* orig_node is the node we're trying to schedule, so spilling it makes
       * no sense. Also don't try to spill any nodes in front of it, since
       * they might be scheduled instead.
       */
      if (node == orig_node)
         break;

      if (node->op == gpir_op_mov) {
         /* Don't try to spill loads, since that only adds another load and
          * store which is likely pointless.
          */
         continue;
      }

      if (!gpir_is_input_node(node) || !node->sched.max_node)
         continue;

      if (try_spill_node(ctx, node)) {
         ctx->max_node_spill_needed--;
         ctx->total_spill_needed--;
      }
   }

   /* Now, try to spill the remaining nodes. */
   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
      if (ctx->total_spill_needed <= 0)
         break;

      if (node == orig_node)
         break;

      if (node->op == gpir_op_mov)
         continue;

      if (!gpir_is_input_node(node) ||
          !(node->sched.max_node || node->sched.next_max_node))
         continue;

      if (try_spill_node(ctx, node))
         ctx->total_spill_needed--;
   }

   return ctx->total_spill_needed <= 0 && ctx->max_node_spill_needed <= 0;
}

static int gpir_get_curr_ready_list_slots(sched_ctx *ctx)
{
   int total = 0;
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      total += gpir_get_slots_required(node);
   }

   return total;
}

/* What gpir_get_min_end() would return if node were replaced with a move
 * instruction not in the complex slot. Normally this is 2 + min_end, except
 * for some store instructions which must have the move node in the same
 * instruction.
 */
static int gpir_get_min_end_as_move(gpir_node *node)
{
   int min = INT_MAX;
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (succ->sched.instr && dep->type == GPIR_DEP_INPUT) {
         switch (succ->op) {
         case gpir_op_store_temp:
         case gpir_op_store_reg:
         case gpir_op_store_varying:
            continue;
         default:
            break;
         }
         if (min > succ->sched.instr->index + 2)
            min = succ->sched.instr->index + 2;
      }
   }
   return min;
}

/* The second source for add0, add1, mul0, and mul1 units cannot be complex.
 * The hardware overwrites the add second sources with 0 and mul second
 * sources with 1. This can be a problem if we need to insert more next-max
 * moves but we only have values that can't use the complex unit for moves.
 *
 * Fortunately, we only need to insert a next-max move if there are more than
 * 5 next-max nodes, but there are only 4 sources in the previous instruction
 * that make values not complex-capable, which means there can be at most 4
 * non-complex-capable values. Hence there will always be at least two values
 * that can be rewritten to use a move in the complex slot. However, we have
 * to be careful not to waste those values by putting both of them in a
 * non-complex slot. This is handled for us by gpir_instr, which will reject
 * such instructions. We just need to tell it which nodes can use complex, and
 * it will do the accounting to figure out what is safe.
 */
static bool can_use_complex(gpir_node *node)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type != GPIR_DEP_INPUT)
         continue;

      gpir_node *succ = dep->succ;
      if (succ->type != gpir_node_type_alu ||
          !succ->sched.instr)
         continue;

      /* Note: this must be consistent with gpir_codegen_{mul,add}_slot{0,1}
       */
      gpir_alu_node *alu = gpir_node_to_alu(succ);
      switch (alu->node.op) {
      case gpir_op_complex1:
         /* complex1 puts its third source in the fourth slot */
         if (alu->children[1] == node || alu->children[2] == node)
            return false;
         break;
      case gpir_op_complex2:
         /* complex2 has its source duplicated, since it actually takes two
          * sources but we only ever use it with both sources the same. Hence
          * its source can never be the complex slot.
          */
         return false;
      case gpir_op_select:
         /* Select has its sources rearranged */
         if (alu->children[0] == node)
            return false;
         break;
      default:
         assert(alu->num_child <= 2);
         if (alu->num_child == 2 && alu->children[1] == node)
            return false;
         break;
      }
   }

   return true;
}

/* Initialize node->sched.max_node and node->sched.next_max_node for every
 * input node on the ready list. We should only need to do this once per
 * instruction, at the beginning, since we never add max nodes to the ready
 * list.
 */
static void sched_find_max_nodes(sched_ctx *ctx)
{
   ctx->instr->alu_num_unscheduled_next_max = 0;
   ctx->instr->alu_num_slot_needed_by_max = 0;

   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (!gpir_is_input_node(node))
         continue;

      int min_end_move = gpir_get_min_end_as_move(node);
      node->sched.max_node = (min_end_move == ctx->instr->index);
      node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
      if (node->sched.next_max_node)
         node->sched.complex_allowed = can_use_complex(node);

      if (node->sched.max_node)
         ctx->instr->alu_num_slot_needed_by_max++;
      if (node->sched.next_max_node)
         ctx->instr->alu_num_unscheduled_next_max++;
   }
}

/* Verify the invariants described in gpir.h, as well as making sure the
 * counts are correct.
 */
static void verify_max_nodes(sched_ctx *ctx)
{
   int alu_num_slot_needed_by_max = 0;
   int alu_num_unscheduled_next_max = 0;
   int alu_num_slot_needed_by_store = 0;
   int alu_num_slot_needed_by_non_cplx_store = 0;
   int alu_max_allowed_next_max = 5;

   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (!gpir_is_input_node(node))
         continue;

      if (node->sched.max_node)
         alu_num_slot_needed_by_max++;
      if (node->sched.next_max_node)
         alu_num_unscheduled_next_max++;
      if (used_by_store(node, ctx->instr)) {
         alu_num_slot_needed_by_store++;
         if (node->sched.next_max_node && !node->sched.complex_allowed)
            alu_num_slot_needed_by_non_cplx_store++;
      }
   }

   if (ctx->instr->slots[GPIR_INSTR_SLOT_MUL0] &&
       ctx->instr->slots[GPIR_INSTR_SLOT_MUL0]->op == gpir_op_complex1)
      alu_max_allowed_next_max = 4;

   assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
   assert(ctx->instr->alu_num_unscheduled_next_max == alu_num_unscheduled_next_max);
   assert(ctx->instr->alu_max_allowed_next_max == alu_max_allowed_next_max);
   assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
   assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
          alu_num_slot_needed_by_non_cplx_store);
   assert(ctx->instr->alu_num_slot_free >=
          alu_num_slot_needed_by_store + alu_num_slot_needed_by_max +
          MAX2(alu_num_unscheduled_next_max - alu_max_allowed_next_max, 0));
   assert(ctx->instr->alu_non_cplx_slot_free >=
          alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
}
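
/* Pick the best non-move node to schedule: speculatively try each candidate
 * on the ready list (spilling if needed), keep the one with the best score,
 * and then schedule it for real. Returns false if nothing could be placed.
 */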
static bool try_node(sched_ctx *ctx)
{
   gpir_node *best_node = NULL;
   int best_score = INT_MIN;

   /* Spilling will delete arbitrary nodes after the current one in the ready
    * list, which means that we always need to look up the next node in the
    * list at the end of each iteration. While list_for_each_entry() works for
    * this purpose, its sanity checking assumes that you don't want to modify
    * the list at all. We know better here, so we have to open-code
    * list_for_each_entry() without the check in order to not assert.
    */
   for (gpir_node *node = LIST_ENTRY(gpir_node, ctx->ready_list.next, list);
        &node->list != &ctx->ready_list;
        node = LIST_ENTRY(gpir_node, node->list.next, list)) {
      if (best_score != INT_MIN) {
         if (node->sched.dist < best_node->sched.dist)
            break;
      }

      if (node->sched.ready) {
         ctx->total_spill_needed = 0;
         ctx->max_node_spill_needed = 0;
         int score = schedule_try_node(ctx, node, true);
         if (score == INT_MIN && !best_node &&
             ctx->total_spill_needed > 0 &&
             try_spill_nodes(ctx, node)) {
            score = schedule_try_node(ctx, node, true);
         }

         /* schedule_first nodes must be scheduled if possible */
         if (gpir_op_infos[node->op].schedule_first && score != INT_MIN) {
            best_node = node;
            best_score = score;
            break;
         }

         if (score > best_score) {
            best_score = score;
            best_node = node;
         }
      }
   }

   if (best_node) {
      gpir_debug("scheduling %d (score = %d)%s\n", best_node->index,
                 best_score, best_node->sched.max_node ? " (max)" : "");
      ASSERTED int score = schedule_try_node(ctx, best_node, false);
      assert(score != INT_MIN);
      return true;
   }

   return false;
}

static void place_move(sched_ctx *ctx, gpir_node *node)
{
   /* For complex1 that is consumed by a postlog2, we cannot allow any moves
    * in between. Convert the postlog2 to a move and insert a new postlog2,
    * and try to schedule it again in try_node().
    */
   gpir_node *postlog2 = consuming_postlog2(node);
   if (postlog2) {
      postlog2->op = gpir_op_mov;
      create_postlog2(ctx, node);
      return;
   }

   gpir_node *move = create_move(ctx, node);
   gpir_node_foreach_succ_safe(move, dep) {
      gpir_node *succ = dep->succ;
      if (!succ->sched.instr ||
          ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep)) {
         gpir_node_replace_pred(dep, node);
         if (dep->type == GPIR_DEP_INPUT)
            gpir_node_replace_child(succ, move, node);
      }
   }
   ASSERTED int score = schedule_try_node(ctx, move, false);
   assert(score != INT_MIN);
}

/* For next-max nodes, not every node can be offloaded to a move in the
 * complex slot. If we run out of non-complex slots, then such nodes cannot
 * have moves placed for them. There should always be sufficient
 * complex-capable nodes so that this isn't a problem. We also disallow moves
 * for schedule_first nodes here.
 */
static bool can_place_move(sched_ctx *ctx, gpir_node *node)
{
   if (gpir_op_infos[node->op].schedule_first)
      return false;

   if (!node->sched.next_max_node)
      return true;

   if (node->sched.complex_allowed)
      return true;

   return ctx->instr->alu_non_cplx_slot_free > 0;
}
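
/* Try to place a single move node for the current instruction, in priority
 * order: max nodes first, then values consumed by stores here, then the
 * complex1 special case, and finally extra next-max moves that fit. Returns
 * true if a move was placed.
 */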
static bool sched_move(sched_ctx *ctx)
{
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      if (node->sched.max_node) {
         place_move(ctx, node);
         return true;
      }
   }

   if (ctx->instr->alu_num_slot_needed_by_store > 0) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (used_by_store(node, ctx->instr)) {
            place_move(ctx, node);
            /* If we have a store of a load, then we need to make sure that we
             * immediately schedule the dependent load, or create a move
             * instruction for it, like we would with a normal instruction.
             * The rest of the code isn't set up to handle load nodes in the
             * ready list -- see the comments in _schedule_try_node().
             */
            if (node->type == gpir_node_type_load) {
               if (!schedule_try_place_node(ctx, node, false)) {
                  create_move(ctx, node);
               }
            }

            return true;
         }
      }
   }

   /* complex1 is a bit of a special case, since it has a latency of 2 cycles.
    * Once it is fully ready, we need to group all its uses in the same
    * instruction, and then we need to avoid creating any moves in the next
    * cycle in order to get it scheduled. Failing to do any of these things
    * could result in a cycle penalty, or even worse, an infinite loop of
    * inserting moves. If it is a next-max node and ready, then it has a use
    * in the previous cycle. If it has a use in the current cycle as well,
    * then we want to insert a move node to make it ready in two cycles -- if
    * we don't, then there will be at least a one cycle penalty. Otherwise, it
    * will be ready next cycle, and we shouldn't insert a move node, or else
    * we'll also have a one cycle penalty.
    */
   if (ctx->instr->alu_num_slot_free > 0) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
             node->sched.ready) {
            bool skip = true;
            gpir_node_foreach_succ(node, dep) {
               if (dep->type != GPIR_DEP_INPUT)
                  continue;

               gpir_node *succ = dep->succ;
               if (!succ->sched.instr ||
                   succ->sched.instr->index != ctx->instr->index - 1) {
                  skip = false;
                  break;
               }
            }

            if (skip)
               continue;

            place_move(ctx, node);
            return true;
         }
      }
   }

   /* Once we've made all the required moves, we're free to use any extra
    * slots to schedule more moves for next max nodes. Besides sometimes being
    * necessary, this can free up extra space in the next instruction. We walk
    * from back to front so that we pick nodes less likely to be scheduled
    * next first -- an extra move would be unnecessary there. But make sure
    * not to handle the complex1 case handled above.
    */
   if (ctx->instr->alu_num_slot_free > 0) {
      list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node &&
             !(node->op == gpir_op_complex1 && node->sched.ready)) {
            place_move(ctx, node);
            return true;
         }
      }
   }

   /* We may have skipped complex1 above, but if we run out of space, we still
    * need to insert the move.
    */
   if (ctx->instr->alu_num_unscheduled_next_max >
       ctx->instr->alu_max_allowed_next_max) {
      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
         if (!can_place_move(ctx, node))
            continue;

         if (node->sched.next_max_node) {
            place_move(ctx, node);
            return true;
         }
      }
   }

   return false;
}
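
/* One scheduling step for the current instruction: prefer a real node from
 * the ready list, and fall back to inserting a move. Returns false when
 * nothing more can be placed in this instruction.
 */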
static bool gpir_sched_instr_pass(sched_ctx *ctx)
{
   if (try_node(ctx))
      return true;

   if (sched_move(ctx))
      return true;

   return false;
}

static void schedule_print_pre_one_instr(sched_ctx *ctx)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   printf("instr %d for ready list:", ctx->instr->index);
   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
      printf(" %d/%c (%d, %d, %s)", node->index, node->sched.ready ? 'r' : 'p',
             node->sched.dist, gpir_get_slots_required(node),
             node->sched.max_node ? "max" : (node->sched.next_max_node ? "next" : "none"));
   }
   printf("\nlive physregs: ");
   for (unsigned i = 0; i < 16; i++) {
      if (ctx->live_physregs & (0xfull << (4 * i))) {
         printf(" $%d.", i);
         for (unsigned j = 0; j < 4; j++) {
            if (ctx->live_physregs & (1ull << (4 * i + j)))
               printf("%c", "xyzw"[j]);
         }
      }
   }
   printf("\n");
}

static void schedule_print_post_one_instr(gpir_instr *instr)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   printf("post schedule instr");
   for (int i = 0; i < GPIR_INSTR_SLOT_NUM; i++) {
      if (instr->slots[i])
         printf(" %d/%d", i, instr->slots[i]->index);
   }
   printf("\n");
}
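
/* Create a new instruction and fill it: classify max/next-max nodes on the
 * ready list, then run scheduling passes until neither a real node nor a
 * move can be placed.
 */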
static bool schedule_one_instr(sched_ctx *ctx)
{
   gpir_instr *instr = gpir_instr_create(ctx->block);
   if (unlikely(!instr))
      return false;

   ctx->instr = instr;

   sched_find_max_nodes(ctx);
   schedule_print_pre_one_instr(ctx);

   while (gpir_sched_instr_pass(ctx)) {
      assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx));
      verify_max_nodes(ctx);
      verify_ready_list(ctx);
   }

   schedule_print_post_one_instr(instr);
   return true;
}
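
/* Schedule a single basic block bottom-up: seed the ready list with the
 * block's root nodes and keep emitting instructions until the list is empty.
 */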
static bool schedule_block(gpir_block *block)
{
   /* calculate distance */
   list_for_each_entry(gpir_node, node, &block->node_list, list) {
      if (gpir_node_is_root(node))
         schedule_update_distance(node);
   }

   sched_ctx ctx;
   list_inithead(&ctx.ready_list);
   ctx.block = block;
   ctx.ready_list_slots = 0;
   ctx.live_physregs = block->live_out_phys;

   for (unsigned i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) {
      list_inithead(&ctx.physreg_reads[i]);
   }

   /* construct the ready list from root nodes */
   list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
      /* Add to physreg_reads */
      if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         unsigned index = 4 * load->index + load->component;
         list_addtail(&load->reg_link, &ctx.physreg_reads[index]);
      }

      if (gpir_node_is_root(node))
         schedule_insert_ready_list(&ctx, node);
   }

   list_inithead(&block->node_list);
   while (!list_is_empty(&ctx.ready_list)) {
      if (!schedule_one_instr(&ctx))
         return false;
   }

   return true;
}

static void add_fake_dep(gpir_node *node, gpir_node *dep_node,
                         gpir_node *last_written[])
{
   gpir_node_foreach_pred(node, dep) {
      if (dep->type == GPIR_DEP_INPUT) {
         int index = dep->pred->value_reg;
         if (index >= 0 && last_written[index]) {
            gpir_node_add_dep(last_written[index], dep_node,
                              GPIR_DEP_WRITE_AFTER_READ);
         }
         if (gpir_op_infos[dep->pred->op].schedule_first) {
            /* Insert fake dependencies for any schedule_first children on
             * this node as well. This guarantees that as soon as
             * "dep_node" is ready to schedule, all of its schedule_first
             * children, grandchildren, etc. are ready so that they can be
             * scheduled as soon as possible.
             */
            add_fake_dep(dep->pred, dep_node, last_written);
         }
      }
   }
}

static void schedule_build_dependency(gpir_block *block)
{
   gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0};

   /* merge dummy_f/m to the node created from */
   list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
      if (node->op == gpir_op_dummy_m) {
         gpir_alu_node *alu = gpir_node_to_alu(node);
         gpir_node *origin = alu->children[0];
         gpir_node *dummy_f = alu->children[1];

         gpir_node_foreach_succ(node, dep) {
            gpir_node *succ = dep->succ;
            /* origin and node may have same succ (by VREG/INPUT or
             * VREG/VREG dep), so use gpir_node_add_dep() instead of
             * gpir_node_replace_pred() */
            gpir_node_add_dep(succ, origin, dep->type);
            gpir_node_replace_child(succ, node, origin);
         }
         gpir_node_delete(dummy_f);
         gpir_node_delete(node);
      }
   }

   memset(last_written, 0, sizeof(last_written));

   /* False dependencies. For value registers, these exist only to make sure
    * that the maximum pressure isn't exceeded and are hence "fake".
    */
   list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
      if (node->op == gpir_op_load_reg) {
         gpir_load_node *load = gpir_node_to_load(node);
         unsigned index = 4 * load->index + load->component;
         if (last_written[index]) {
            gpir_node_add_dep(last_written[index], node,
                              GPIR_DEP_WRITE_AFTER_READ);
         }
      } else if (node->op == gpir_op_store_reg) {
         gpir_store_node *store = gpir_node_to_store(node);
         unsigned index = 4 * store->index + store->component;
         last_written[index] = node;
      } else {
         add_fake_dep(node, node, last_written);
      }

      if (node->value_reg >= 0)
         last_written[node->value_reg] = node;
   }
}

static void print_statistic(gpir_compiler *comp, int save_index)
{
   int num_nodes[gpir_op_num] = {0};
   int num_created_nodes[gpir_op_num] = {0};

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         num_nodes[node->op]++;
         if (node->index >= save_index)
            num_created_nodes[node->op]++;
      }
   }

   printf("====== gpir scheduler statistic ======\n");
   printf("---- how many nodes are scheduled ----\n");
   int n = 0;
   for (int i = 0; i < gpir_op_num; i++) {
      if (num_nodes[i]) {
         printf("%10s:%-6d", gpir_op_infos[i].name, num_nodes[i]);
         n += num_nodes[i];
      }
   }
   printf("\ntotal: %d\n", n);

   printf("---- how many nodes are created ----\n");
   n = 0;
   for (int i = 0; i < gpir_op_num; i++) {
      if (num_created_nodes[i]) {
         printf("%10s:%-6d", gpir_op_infos[i].name, num_created_nodes[i]);
         n += num_created_nodes[i];
      }
   }
   printf("\ntotal: %d\n", n);
   printf("------------------------------------\n");
}
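
/* Scheduler entry point: reset per-node scheduling state, build the
 * dependency information for each block, and then schedule every block.
 */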
bool gpir_schedule_prog(gpir_compiler *comp)
{
   int save_index = comp->cur_index;

   /* init schedule info */
   int index = 0;
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      block->sched.instr_index = 0;
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         node->sched.instr = NULL;
         node->sched.pos = -1;
         node->sched.index = index++;
         node->sched.dist = -1;
         /* TODO when we support multiple basic blocks, we need a way to keep
          * track of this for physregs allocated before the scheduler.
          */
         node->sched.physreg_store = NULL;
         node->sched.ready = false;
         node->sched.inserted = false;
         node->sched.complex_allowed = false;
         node->sched.max_node = false;
         node->sched.next_max_node = false;
      }
   }

   /* build dependency */
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      schedule_build_dependency(block);
   }

   //gpir_debug("after scheduler build reg dependency\n");
   //gpir_node_print_prog_dep(comp);

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      if (!schedule_block(block)) {
         gpir_error("fail schedule block\n");
         return false;
      }
   }

   if (lima_debug & LIMA_DEBUG_GP) {
      print_statistic(comp, save_index);
      gpir_instr_print_prog(comp);
   }

   return true;
}