src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 #include "brw_fs.h"
  29 #include "brw_cfg.h"
  30 #include "glsl/glsl_types.h"
  31 #include "glsl/ir_optimization.h"
  32
  33 static void
  34 assign_reg(int *reg_hw_locations, fs_reg *reg, int reg_width)
  35 {
  36    if (reg->file == GRF) {
  37       assert(reg->reg_offset >= 0);
  38       reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset * reg_width;
  39       reg->reg_offset = 0;
  40    }
  41 }
  42
  43 void
  44 fs_visitor::assign_regs_trivial()
  45 {
  46    int hw_reg_mapping[this->virtual_grf_count + 1];
  47    int i;
  48    int reg_width = dispatch_width / 8;
  49
  50    /* Note that compressed instructions require alignment to 2 registers. */
  51    hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
  52    for (i = 1; i <= this->virtual_grf_count; i++) {
  53       hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
  54                            this->virtual_grf_sizes[i - 1] * reg_width);
  55    }
  56    this->grf_used = hw_reg_mapping[this->virtual_grf_count];
  57
  58    foreach_in_list(fs_inst, inst, &instructions) {
  59       assign_reg(hw_reg_mapping, &inst->dst, reg_width);
  60       for (i = 0; i < inst->sources; i++) {
  61          assign_reg(hw_reg_mapping, &inst->src[i], reg_width);
  62       }
  63    }
  64
  65    if (this->grf_used >= max_grf) {
  66       fail("Ran out of regs on trivial allocator (%d/%d)\n",
  67            this->grf_used, max_grf);
  68    } else {
  69       this->virtual_grf_count = this->grf_used;
  70    }
  71
  72 }
  73
  74 static void
  75 brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
  76 {
  77    const struct brw_device_info *devinfo = screen->devinfo;
  78    int base_reg_count = BRW_MAX_GRF / reg_width;
  79    int index = reg_width - 1;
  80
  81    /* The registers used to make up almost all values handled in the compiler
  82     * are a scalar value occupying a single register (or 2 registers in the
  83     * case of SIMD16, which is handled by dividing base_reg_count by 2 and
  84     * multiplying allocated register numbers by 2).  Things that were
  85     * aggregates of scalar values at the GLSL level were split to scalar
  86     * values by split_virtual_grfs().
  87     *
  88     * However, texture SEND messages return a series of contiguous registers
  89     * to write into.  We currently always ask for 4 registers, but we may
  90     * convert that to use less some day.
  91     *
  92     * Additionally, on gen5 we need aligned pairs of registers for the PLN
  93     * instruction, and on gen4 we need 8 contiguous regs for workaround simd16
  94     * texturing.
  95     *
  96     * So we have a need for classes for 1, 2, 4, and 8 registers currently,
  97     * and we add in '3' to make indexing the array easier for the common case
  98     * (since we'll probably want it for texturing later).
  99     *
 100     * And, on gen7 and newer, we do texturing SEND messages from GRFs, which
 101     * means that we may need any size up to the sampler message size limit (11
 102     * regs).
 103     */
 104    int class_count;
 105    int class_sizes[BRW_MAX_MRF];
 106
 107    if (devinfo->gen >= 7) {
 108       for (class_count = 0; class_count < MAX_SAMPLER_MESSAGE_SIZE;
 109            class_count++)
 110          class_sizes[class_count] = class_count + 1;
 111    } else {
 112       for (class_count = 0; class_count < 4; class_count++)
 113          class_sizes[class_count] = class_count + 1;
 114       class_sizes[class_count++] = 8;
 115    }
 116
 117    /* Compute the total number of registers across all classes. */
 118    int ra_reg_count = 0;
 119    for (int i = 0; i < class_count; i++) {
 120       ra_reg_count += base_reg_count - (class_sizes[i] - 1);
 121    }
 122
 123    uint8_t *ra_reg_to_grf = ralloc_array(screen, uint8_t, ra_reg_count);
 124    struct ra_regs *regs = ra_alloc_reg_set(screen, ra_reg_count);
 125    if (devinfo->gen >= 6)
 126       ra_set_allocate_round_robin(regs);
 127    int *classes = ralloc_array(screen, int, class_count);
 128    int aligned_pairs_class = -1;
 129
 130    /* Now, add the registers to their classes, and add the conflicts
 131     * between them and the base GRF registers (and also each other).
 132     */
 133    int reg = 0;
 134    int pairs_base_reg = 0;
 135    int pairs_reg_count = 0;
 136    for (int i = 0; i < class_count; i++) {
 137       int class_reg_count = base_reg_count - (class_sizes[i] - 1);
 138       classes[i] = ra_alloc_reg_class(regs);
 139
 140       /* Save this off for the aligned pair class at the end. */
 141       if (class_sizes[i] == 2) {
 142          pairs_base_reg = reg;
 143          pairs_reg_count = class_reg_count;
 144       }
 145
 146       for (int j = 0; j < class_reg_count; j++) {
 147          ra_class_add_reg(regs, classes[i], reg);
 148
 149          ra_reg_to_grf[reg] = j;
 150
 151          for (int base_reg = j;
 152               base_reg < j + class_sizes[i];
 153               base_reg++) {
 154             ra_add_transitive_reg_conflict(regs, base_reg, reg);
 155          }
 156
 157          reg++;
 158       }
 159    }
 160    assert(reg == ra_reg_count);
 161
 162    /* Add a special class for aligned pairs, which we'll put delta_x/y
 163     * in on gen5 so that we can do PLN.
 164     */
 165    if (devinfo->has_pln && reg_width == 1 && devinfo->gen < 6) {
 166       aligned_pairs_class = ra_alloc_reg_class(regs);
 167
 168       for (int i = 0; i < pairs_reg_count; i++) {
 169          if ((ra_reg_to_grf[pairs_base_reg + i] & 1) == 0) {
 170             ra_class_add_reg(regs, aligned_pairs_class, pairs_base_reg + i);
 171          }
 172       }
 173    }
 174
 175    ra_set_finalize(regs, NULL);
 176
 177    screen->wm_reg_sets[index].regs = regs;
 178    for (unsigned i = 0; i < ARRAY_SIZE(screen->wm_reg_sets[index].classes); i++)
 179       screen->wm_reg_sets[index].classes[i] = -1;
 180    for (int i = 0; i < class_count; i++)
 181       screen->wm_reg_sets[index].classes[class_sizes[i] - 1] = classes[i];
 182    screen->wm_reg_sets[index].ra_reg_to_grf = ra_reg_to_grf;
 183    screen->wm_reg_sets[index].aligned_pairs_class = aligned_pairs_class;
 184 }
 185
 186 void
 187 brw_fs_alloc_reg_sets(struct intel_screen *screen)
 188 {
 189    brw_alloc_reg_set(screen, 1);
 190    brw_alloc_reg_set(screen, 2);
 191 }
 192
 193 int
 194 count_to_loop_end(fs_inst *do_inst)
 195 {
 196    int depth = 1;
 197    int ip = 1;
 198    for (fs_inst *inst = (fs_inst *)do_inst->next;
 199         depth > 0;
 200         inst = (fs_inst *)inst->next) {
 201       switch (inst->opcode) {
 202       case BRW_OPCODE_DO:
 203          depth++;
 204          break;
 205       case BRW_OPCODE_WHILE:
 206          depth--;
 207          break;
 208       default:
 209          break;
 210       }
 211       ip++;
 212    }
 213    return ip;
 214 }
 215
 216 /**
 217  * Sets up interference between thread payload registers and the virtual GRFs
 218  * to be allocated for program temporaries.
 219  *
 220  * We want to be able to reallocate the payload for our virtual GRFs, notably
 221  * because the setup coefficients for a full set of 16 FS inputs takes up 8 of
 222  * our 128 registers.
 223  *
 224  * The layout of the payload registers is:
 225  *
 226  * 0..payload.num_regs-1: fixed function setup (including bary coordinates).
 227  * payload.num_regs..payload.num_regs+curb_read_lengh-1: uniform data
 228  * payload.num_regs+curb_read_lengh..first_non_payload_grf-1: setup coefficients.
 229  *
 230  * And we have payload_node_count nodes covering these registers in order
 231  * (note that in SIMD16, a node is two registers).
 232  */
 233 void
 234 fs_visitor::setup_payload_interference(struct ra_graph *g,
 235                                        int payload_node_count,
 236                                        int first_payload_node)
 237 {
 238    int reg_width = dispatch_width / 8;
 239    int loop_depth = 0;
 240    int loop_end_ip = 0;
 241
 242    int payload_last_use_ip[payload_node_count];
 243    memset(payload_last_use_ip, 0, sizeof(payload_last_use_ip));
 244    int ip = 0;
 245    foreach_in_list(fs_inst, inst, &instructions) {
 246       switch (inst->opcode) {
 247       case BRW_OPCODE_DO:
 248          loop_depth++;
 249
 250          /* Since payload regs are deffed only at the start of the shader
 251           * execution, any uses of the payload within a loop mean the live
 252           * interval extends to the end of the outermost loop.  Find the ip of
 253           * the end now.
 254           */
 255          if (loop_depth == 1)
 256             loop_end_ip = ip + count_to_loop_end(inst);
 257          break;
 258       case BRW_OPCODE_WHILE:
 259          loop_depth--;
 260          break;
 261       default:
 262          break;
 263       }
 264
 265       int use_ip;
 266       if (loop_depth > 0)
 267          use_ip = loop_end_ip;
 268       else
 269          use_ip = ip;
 270
 271       /* Note that UNIFORM args have been turned into FIXED_HW_REG by
 272        * assign_curbe_setup(), and interpolation uses fixed hardware regs from
 273        * the start (see interp_reg()).
 274        */
 275       for (int i = 0; i < inst->sources; i++) {
 276          if (inst->src[i].file == HW_REG &&
 277              inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
 278             int node_nr = inst->src[i].fixed_hw_reg.nr / reg_width;
 279             if (node_nr >= payload_node_count)
 280                continue;
 281
 282             payload_last_use_ip[node_nr] = use_ip;
 283          }
 284       }
 285
 286       /* Special case instructions which have extra implied registers used. */
 287       switch (inst->opcode) {
 288       case FS_OPCODE_FB_WRITE:
 289          /* We could omit this for the !inst->header_present case, except that
 290           * the simulator apparently incorrectly reads from g0/g1 instead of
 291           * sideband.  It also really freaks out driver developers to see g0
 292           * used in unusual places, so just always reserve it.
 293           */
 294          payload_last_use_ip[0 / reg_width] = use_ip;
 295          payload_last_use_ip[1 / reg_width] = use_ip;
 296          break;
 297
 298       case FS_OPCODE_LINTERP:
 299          /* On gen6+ in SIMD16, there are 4 adjacent registers (so 2 nodes)
 300           * used by PLN's sourcing of the deltas, while we list only the first
 301           * two in the arguments (1 node).  Pre-gen6, the deltas are computed
 302           * in normal VGRFs.
 303           */
 304          if (brw->gen >= 6) {
 305             int delta_x_arg = 0;
 306             if (inst->src[delta_x_arg].file == HW_REG &&
 307                 inst->src[delta_x_arg].fixed_hw_reg.file ==
 308                 BRW_GENERAL_REGISTER_FILE) {
 309                int sechalf_node = (inst->src[delta_x_arg].fixed_hw_reg.nr /
 310                                    reg_width) + 1;
 311                assert(sechalf_node < payload_node_count);
 312                payload_last_use_ip[sechalf_node] = use_ip;
 313             }
 314          }
 315          break;
 316
 317       default:
 318          break;
 319       }
 320
 321       ip++;
 322    }
 323
 324    for (int i = 0; i < payload_node_count; i++) {
 325       /* Mark the payload node as interfering with any virtual grf that is
 326        * live between the start of the program and our last use of the payload
 327        * node.
 328        */
 329       for (int j = 0; j < this->virtual_grf_count; j++) {
 330          /* Note that we use a <= comparison, unlike virtual_grf_interferes(),
 331           * in order to not have to worry about the uniform issue described in
 332           * calculate_live_intervals().
 333           */
 334          if (this->virtual_grf_start[j] <= payload_last_use_ip[i]) {
 335             ra_add_node_interference(g, first_payload_node + i, j);
 336          }
 337       }
 338    }
 339
 340    for (int i = 0; i < payload_node_count; i++) {
 341       /* Mark each payload node as being allocated to its physical register.
 342        *
 343        * The alternative would be to have per-physical-register classes, which
 344        * would just be silly.
 345        */
 346       ra_set_node_reg(g, first_payload_node + i, i);
 347    }
 348 }
 349
 350 /**
 351  * Sets the mrf_used array to indicate which MRFs are used by the shader IR
 352  *
 353  * This is used in assign_regs() to decide which of the GRFs that we use as
 354  * MRFs on gen7 get normally register allocated, and in register spilling to
 355  * see if we can actually use MRFs to do spills without overwriting normal MRF
 356  * contents.
 357  */
 358 void
 359 fs_visitor::get_used_mrfs(bool *mrf_used)
 360 {
 361    int reg_width = dispatch_width / 8;
 362
 363    memset(mrf_used, 0, BRW_MAX_MRF * sizeof(bool));
 364
 365    foreach_in_list(fs_inst, inst, &instructions) {
 366       if (inst->dst.file == MRF) {
 367          int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
 368          mrf_used[reg] = true;
 369          if (reg_width == 2) {
 370             if (inst->dst.reg & BRW_MRF_COMPR4) {
 371                mrf_used[reg + 4] = true;
 372             } else {
 373                mrf_used[reg + 1] = true;
 374             }
 375          }
 376       }
 377
 378       if (inst->mlen > 0) {
 379          for (int i = 0; i < implied_mrf_writes(inst); i++) {
 380             mrf_used[inst->base_mrf + i] = true;
 381          }
 382       }
 383    }
 384 }
 385
 386 /**
 387  * Sets interference between virtual GRFs and usage of the high GRFs for SEND
 388  * messages (treated as MRFs in code generation).
 389  */
 390 void
 391 fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
 392 {
 393    int reg_width = dispatch_width / 8;
 394
 395    bool mrf_used[BRW_MAX_MRF];
 396    get_used_mrfs(mrf_used);
 397
 398    for (int i = 0; i < BRW_MAX_MRF; i++) {
 399       /* Mark each MRF reg node as being allocated to its physical register.
 400        *
 401        * The alternative would be to have per-physical-register classes, which
 402        * would just be silly.
 403        */
 404       ra_set_node_reg(g, first_mrf_node + i,
 405                       (GEN7_MRF_HACK_START + i) / reg_width);
 406
 407       /* Since we don't have any live/dead analysis on the MRFs, just mark all
 408        * that are used as conflicting with all virtual GRFs.
 409        */
 410       if (mrf_used[i]) {
 411          for (int j = 0; j < this->virtual_grf_count; j++) {
 412             ra_add_node_interference(g, first_mrf_node + i, j);
 413          }
 414       }
 415    }
 416 }
 417
 418 bool
 419 fs_visitor::assign_regs(bool allow_spilling)
 420 {
 421    struct intel_screen *screen = brw->intelScreen;
 422    /* Most of this allocation was written for a reg_width of 1
 423     * (dispatch_width == 8).  In extending to SIMD16, the code was
 424     * left in place and it was converted to have the hardware
 425     * registers it's allocating be contiguous physical pairs of regs
 426     * for reg_width == 2.
 427     */
 428    int reg_width = dispatch_width / 8;
 429    int hw_reg_mapping[this->virtual_grf_count];
 430    int payload_node_count = (ALIGN(this->first_non_payload_grf, reg_width) /
 431                             reg_width);
 432    int rsi = reg_width - 1; /* Which screen->wm_reg_sets[] to use */
 433    calculate_live_intervals();
 434
 435    int node_count = this->virtual_grf_count;
 436    int first_payload_node = node_count;
 437    node_count += payload_node_count;
 438    int first_mrf_hack_node = node_count;
 439    if (brw->gen >= 7)
 440       node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START;
 441    struct ra_graph *g = ra_alloc_interference_graph(screen->wm_reg_sets[rsi].regs,
 442                                                     node_count);
 443
 444    for (int i = 0; i < this->virtual_grf_count; i++) {
 445       unsigned size = this->virtual_grf_sizes[i];
 446       int c;
 447
 448       assert(size <= ARRAY_SIZE(screen->wm_reg_sets[rsi].classes) &&
 449              "Register allocation relies on split_virtual_grfs()");
 450       c = screen->wm_reg_sets[rsi].classes[size - 1];
 451
 452       /* Special case: on pre-GEN6 hardware that supports PLN, the
 453        * second operand of a PLN instruction needs to be an
 454        * even-numbered register, so we have a special register class
 455        * wm_aligned_pairs_class to handle this case.  pre-GEN6 always
 456        * uses this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the
 457        * second operand of a PLN instruction (since it doesn't support
 458        * any other interpolation modes).  So all we need to do is find
 459        * that register and set it to the appropriate class.
 460        */
 461       if (screen->wm_reg_sets[rsi].aligned_pairs_class >= 0 &&
 462           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF &&
 463           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) {
 464          c = screen->wm_reg_sets[rsi].aligned_pairs_class;
 465       }
 466
 467       ra_set_node_class(g, i, c);
 468
 469       for (int j = 0; j < i; j++) {
 470          if (virtual_grf_interferes(i, j)) {
 471             ra_add_node_interference(g, i, j);
 472          }
 473       }
 474    }
 475
 476    setup_payload_interference(g, payload_node_count, first_payload_node);
 477    if (brw->gen >= 7)
 478       setup_mrf_hack_interference(g, first_mrf_hack_node);
 479
 480    /* Debug of register spilling: Go spill everything. */
 481    if (0) {
 482       int reg = choose_spill_reg(g);
 483
 484       if (reg != -1) {
 485          spill_reg(reg);
 486          ralloc_free(g);
 487          return false;
 488       }
 489    }
 490
 491    if (!ra_allocate(g)) {
 492       /* Failed to allocate registers.  Spill a reg, and the caller will
 493        * loop back into here to try again.
 494        */
 495       int reg = choose_spill_reg(g);
 496
 497       if (reg == -1) {
 498          fail("no register to spill:\n");
 499          dump_instructions(NULL);
 500       } else if (allow_spilling) {
 501          spill_reg(reg);
 502       }
 503
 504       ralloc_free(g);
 505
 506       return false;
 507    }
 508
 509    /* Get the chosen virtual registers for each node, and map virtual
 510     * regs in the register classes back down to real hardware reg
 511     * numbers.
 512     */
 513    this->grf_used = payload_node_count * reg_width;
 514    for (int i = 0; i < this->virtual_grf_count; i++) {
 515       int reg = ra_get_node_reg(g, i);
 516
 517       hw_reg_mapping[i] = screen->wm_reg_sets[rsi].ra_reg_to_grf[reg] * reg_width;
 518       this->grf_used = MAX2(this->grf_used,
 519                             hw_reg_mapping[i] + this->virtual_grf_sizes[i] *
 520                             reg_width);
 521    }
 522
 523    foreach_in_list(fs_inst, inst, &instructions) {
 524       assign_reg(hw_reg_mapping, &inst->dst, reg_width);
 525       for (int i = 0; i < inst->sources; i++) {
 526          assign_reg(hw_reg_mapping, &inst->src[i], reg_width);
 527       }
 528    }
 529
 530    this->virtual_grf_count = this->grf_used;
 531
 532    ralloc_free(g);
 533
 534    return true;
 535 }
 536
 537 void
 538 fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
 539                          uint32_t spill_offset, int count)
 540 {
 541    for (int i = 0; i < count; i++) {
 542       /* The gen7 descriptor-based offset is 12 bits of HWORD units. */
 543       bool gen7_read = brw->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE;
 544
 545       fs_inst *unspill_inst =
 546          new(mem_ctx) fs_inst(gen7_read ?
 547                               SHADER_OPCODE_GEN7_SCRATCH_READ :
 548                               SHADER_OPCODE_GEN4_SCRATCH_READ,
 549                               dst);
 550       unspill_inst->offset = spill_offset;
 551       unspill_inst->ir = inst->ir;
 552       unspill_inst->annotation = inst->annotation;
 553
 554       if (!gen7_read) {
 555          unspill_inst->base_mrf = 14;
 556          unspill_inst->mlen = 1; /* header contains offset */
 557       }
 558       inst->insert_before(block, unspill_inst);
 559
 560       dst.reg_offset++;
 561       spill_offset += dispatch_width * sizeof(float);
 562    }
 563 }
 564
 565 int
 566 fs_visitor::choose_spill_reg(struct ra_graph *g)
 567 {
 568    float loop_scale = 1.0;
 569    float spill_costs[this->virtual_grf_count];
 570    bool no_spill[this->virtual_grf_count];
 571
 572    for (int i = 0; i < this->virtual_grf_count; i++) {
 573       spill_costs[i] = 0.0;
 574       no_spill[i] = false;
 575    }
 576
 577    /* Calculate costs for spilling nodes.  Call it a cost of 1 per
 578     * spill/unspill we'll have to do, and guess that the insides of
 579     * loops run 10 times.
 580     */
 581    foreach_in_list(fs_inst, inst, &instructions) {
 582       for (unsigned int i = 0; i < inst->sources; i++) {
 583          if (inst->src[i].file == GRF) {
 584             spill_costs[inst->src[i].reg] += loop_scale;
 585
 586             /* Register spilling logic assumes full-width registers; smeared
 587              * registers have a width of 1 so if we try to spill them we'll
 588              * generate invalid assembly.  This shouldn't be a problem because
 589              * smeared registers are only used as short-term temporaries when
 590              * loading pull constants, so spilling them is unlikely to reduce
 591              * register pressure anyhow.
 592              */
 593             if (!inst->src[i].is_contiguous()) {
 594                no_spill[inst->src[i].reg] = true;
 595             }
 596          }
 597       }
 598
 599       if (inst->dst.file == GRF) {
 600          spill_costs[inst->dst.reg] += inst->regs_written * loop_scale;
 601
 602          if (!inst->dst.is_contiguous()) {
 603             no_spill[inst->dst.reg] = true;
 604          }
 605       }
 606
 607       switch (inst->opcode) {
 608
 609       case BRW_OPCODE_DO:
 610          loop_scale *= 10;
 611          break;
 612
 613       case BRW_OPCODE_WHILE:
 614          loop_scale /= 10;
 615          break;
 616
 617       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 618          if (inst->src[0].file == GRF)
 619             no_spill[inst->src[0].reg] = true;
 620          break;
 621
 622       case SHADER_OPCODE_GEN4_SCRATCH_READ:
 623       case SHADER_OPCODE_GEN7_SCRATCH_READ:
 624          if (inst->dst.file == GRF)
 625             no_spill[inst->dst.reg] = true;
 626          break;
 627
 628       default:
 629          break;
 630       }
 631    }
 632
 633    for (int i = 0; i < this->virtual_grf_count; i++) {
 634       if (!no_spill[i])
 635          ra_set_node_spill_cost(g, i, spill_costs[i]);
 636    }
 637
 638    return ra_get_best_spill_node(g);
 639 }
 640
 641 void
 642 fs_visitor::spill_reg(int spill_reg)
 643 {
 644    int reg_size = dispatch_width * sizeof(float);
 645    int size = virtual_grf_sizes[spill_reg];
 646    unsigned int spill_offset = last_scratch;
 647    assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
 648    int spill_base_mrf = dispatch_width > 8 ? 13 : 14;
 649
 650    /* Spills may use MRFs 13-15 in the SIMD16 case.  Our texturing is done
 651     * using up to 11 MRFs starting from either m1 or m2, and fb writes can use
 652     * up to m13 (gen6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or
 653     * m15 (gen4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst
 654     * depth), starting from m1.  In summary: We may not be able to spill in
 655     * SIMD16 mode, because we'd stomp the FB writes.
 656     */
 657    if (!spilled_any_registers) {
 658       bool mrf_used[BRW_MAX_MRF];
 659       get_used_mrfs(mrf_used);
 660
 661       for (int i = spill_base_mrf; i < BRW_MAX_MRF; i++) {
 662          if (mrf_used[i]) {
 663             fail("Register spilling not supported with m%d used", i);
 664           return;
 665          }
 666       }
 667
 668       spilled_any_registers = true;
 669    }
 670
 671    last_scratch += size * reg_size;
 672
 673    calculate_cfg();
 674
 675    /* Generate spill/unspill instructions for the objects being
 676     * spilled.  Right now, we spill or unspill the whole thing to a
 677     * virtual grf of the same size.  For most instructions, though, we
 678     * could just spill/unspill the GRF being accessed.
 679     */
 680    foreach_block_and_inst (block, fs_inst, inst, cfg) {
 681       for (unsigned int i = 0; i < inst->sources; i++) {
 682          if (inst->src[i].file == GRF &&
 683              inst->src[i].reg == spill_reg) {
 684             int regs_read = inst->regs_read(this, i);
 685             int subset_spill_offset = (spill_offset +
 686                                        reg_size * inst->src[i].reg_offset);
 687             fs_reg unspill_dst(GRF, virtual_grf_alloc(regs_read));
 688
 689             inst->src[i].reg = unspill_dst.reg;
 690             inst->src[i].reg_offset = 0;
 691
 692             emit_unspill(block, inst, unspill_dst, subset_spill_offset,
 693                          regs_read);
 694          }
 695       }
 696
 697       if (inst->dst.file == GRF &&
 698           inst->dst.reg == spill_reg) {
 699          int subset_spill_offset = (spill_offset +
 700                                     reg_size * inst->dst.reg_offset);
 701          fs_reg spill_src(GRF, virtual_grf_alloc(inst->regs_written));
 702
 703          inst->dst.reg = spill_src.reg;
 704          inst->dst.reg_offset = 0;
 705
 706          /* If our write is going to affect just part of the
 707           * inst->regs_written(), then we need to unspill the destination
 708           * since we write back out all of the regs_written().
 709           */
 710          if (inst->predicate || inst->force_uncompressed ||
 711              inst->force_sechalf || inst->dst.subreg_offset) {
 712             emit_unspill(block, inst, spill_src, subset_spill_offset,
 713                          inst->regs_written);
 714          }
 715
 716          for (int chan = 0; chan < inst->regs_written; chan++) {
 717             fs_inst *spill_inst =
 718                new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 719                                     reg_null_f, spill_src);
 720             spill_src.reg_offset++;
 721             spill_inst->offset = subset_spill_offset + chan * reg_size;
 722             spill_inst->ir = inst->ir;
 723             spill_inst->annotation = inst->annotation;
 724             spill_inst->mlen = 1 + dispatch_width / 8; /* header, value */
 725             spill_inst->base_mrf = spill_base_mrf;
 726             inst->insert_after(block, spill_inst);
 727          }
 728       }
 729    }
 730
 731    invalidate_live_intervals(false);
 732 }