src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 #include "brw_fs.h"
  29 #include "glsl/glsl_types.h"
  30 #include "glsl/ir_optimization.h"
  31
  32 static void
  33 assign_reg(int *reg_hw_locations, fs_reg *reg, int reg_width)
  34 {
  35    if (reg->file == GRF) {
  36       assert(reg->reg_offset >= 0);
  37       reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset * reg_width;
  38       reg->reg_offset = 0;
  39    }
  40 }
  41
  42 void
  43 fs_visitor::assign_regs_trivial()
  44 {
  45    int hw_reg_mapping[this->virtual_grf_count + 1];
  46    int i;
  47    int reg_width = dispatch_width / 8;
  48
  49    /* Note that compressed instructions require alignment to 2 registers. */
  50    hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
  51    for (i = 1; i <= this->virtual_grf_count; i++) {
  52       hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
  53                            this->virtual_grf_sizes[i - 1] * reg_width);
  54    }
  55    this->grf_used = hw_reg_mapping[this->virtual_grf_count];
  56
  57    foreach_list(node, &this->instructions) {
  58       fs_inst *inst = (fs_inst *)node;
  59
  60       assign_reg(hw_reg_mapping, &inst->dst, reg_width);
  61       assign_reg(hw_reg_mapping, &inst->src[0], reg_width);
  62       assign_reg(hw_reg_mapping, &inst->src[1], reg_width);
  63       assign_reg(hw_reg_mapping, &inst->src[2], reg_width);
  64    }
  65
  66    if (this->grf_used >= max_grf) {
  67       fail("Ran out of regs on trivial allocator (%d/%d)\n",
  68            this->grf_used, max_grf);
  69    }
  70
  71 }
  72
  73 static void
  74 brw_alloc_reg_set(struct brw_context *brw, int reg_width)
  75 {
  76    int base_reg_count = BRW_MAX_GRF / reg_width;
  77    int index = reg_width - 1;
  78
  79    /* The registers used to make up almost all values handled in the compiler
  80     * are a scalar value occupying a single register (or 2 registers in the
  81     * case of SIMD16, which is handled by dividing base_reg_count by 2 and
  82     * multiplying allocated register numbers by 2).  Things that were
  83     * aggregates of scalar values at the GLSL level were split to scalar
  84     * values by split_virtual_grfs().
  85     *
  86     * However, texture SEND messages return a series of contiguous registers
  87     * to write into.  We currently always ask for 4 registers, but we may
  88     * convert that to use less some day.
  89     *
  90     * Additionally, on gen5 we need aligned pairs of registers for the PLN
  91     * instruction, and on gen4 we need 8 contiguous regs for workaround simd16
  92     * texturing.
  93     *
  94     * So we have a need for classes for 1, 2, 4, and 8 registers currently,
  95     * and we add in '3' to make indexing the array easier for the common case
  96     * (since we'll probably want it for texturing later).
  97     *
  98     * And, on gen7 and newer, we do texturing SEND messages from GRFs, which
  99     * means that we may need any size up to the sampler message size limit (11
 100     * regs).
 101     */
 102    int class_count;
 103    int class_sizes[BRW_MAX_MRF];
 104
 105    if (brw->gen >= 7) {
 106       for (class_count = 0; class_count < MAX_SAMPLER_MESSAGE_SIZE;
 107            class_count++)
 108          class_sizes[class_count] = class_count + 1;
 109    } else {
 110       for (class_count = 0; class_count < 4; class_count++)
 111          class_sizes[class_count] = class_count + 1;
 112       class_sizes[class_count++] = 8;
 113    }
 114
 115    /* Compute the total number of registers across all classes. */
 116    int ra_reg_count = 0;
 117    for (int i = 0; i < class_count; i++) {
 118       ra_reg_count += base_reg_count - (class_sizes[i] - 1);
 119    }
 120
 121    uint8_t *ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
 122    struct ra_regs *regs = ra_alloc_reg_set(brw, ra_reg_count);
 123    if (brw->gen >= 6)
 124       ra_set_allocate_round_robin(regs);
 125    int *classes = ralloc_array(brw, int, class_count);
 126    int aligned_pairs_class = -1;
 127
 128    /* Now, add the registers to their classes, and add the conflicts
 129     * between them and the base GRF registers (and also each other).
 130     */
 131    int reg = 0;
 132    int pairs_base_reg = 0;
 133    int pairs_reg_count = 0;
 134    for (int i = 0; i < class_count; i++) {
 135       int class_reg_count = base_reg_count - (class_sizes[i] - 1);
 136       classes[i] = ra_alloc_reg_class(regs);
 137
 138       /* Save this off for the aligned pair class at the end. */
 139       if (class_sizes[i] == 2) {
 140          pairs_base_reg = reg;
 141          pairs_reg_count = class_reg_count;
 142       }
 143
 144       for (int j = 0; j < class_reg_count; j++) {
 145          ra_class_add_reg(regs, classes[i], reg);
 146
 147          ra_reg_to_grf[reg] = j;
 148
 149          for (int base_reg = j;
 150               base_reg < j + class_sizes[i];
 151               base_reg++) {
 152             ra_add_transitive_reg_conflict(regs, base_reg, reg);
 153          }
 154
 155          reg++;
 156       }
 157    }
 158    assert(reg == ra_reg_count);
 159
 160    /* Add a special class for aligned pairs, which we'll put delta_x/y
 161     * in on gen5 so that we can do PLN.
 162     */
 163    if (brw->has_pln && reg_width == 1 && brw->gen < 6) {
 164       aligned_pairs_class = ra_alloc_reg_class(regs);
 165
 166       for (int i = 0; i < pairs_reg_count; i++) {
 167          if ((ra_reg_to_grf[pairs_base_reg + i] & 1) == 0) {
 168             ra_class_add_reg(regs, aligned_pairs_class, pairs_base_reg + i);
 169          }
 170       }
 171    }
 172
 173    ra_set_finalize(regs, NULL);
 174
 175    brw->wm.reg_sets[index].regs = regs;
 176    for (unsigned i = 0; i < ARRAY_SIZE(brw->wm.reg_sets[index].classes); i++)
 177       brw->wm.reg_sets[index].classes[i] = -1;
 178    for (int i = 0; i < class_count; i++)
 179       brw->wm.reg_sets[index].classes[class_sizes[i] - 1] = classes[i];
 180    brw->wm.reg_sets[index].ra_reg_to_grf = ra_reg_to_grf;
 181    brw->wm.reg_sets[index].aligned_pairs_class = aligned_pairs_class;
 182 }
 183
 184 void
 185 brw_fs_alloc_reg_sets(struct brw_context *brw)
 186 {
 187    brw_alloc_reg_set(brw, 1);
 188    brw_alloc_reg_set(brw, 2);
 189 }
 190
 191 int
 192 count_to_loop_end(fs_inst *do_inst)
 193 {
 194    int depth = 1;
 195    int ip = 1;
 196    for (fs_inst *inst = (fs_inst *)do_inst->next;
 197         depth > 0;
 198         inst = (fs_inst *)inst->next) {
 199       switch (inst->opcode) {
 200       case BRW_OPCODE_DO:
 201          depth++;
 202          break;
 203       case BRW_OPCODE_WHILE:
 204          depth--;
 205          break;
 206       default:
 207          break;
 208       }
 209       ip++;
 210    }
 211    return ip;
 212 }
 213
 214 /**
 215  * Sets up interference between thread payload registers and the virtual GRFs
 216  * to be allocated for program temporaries.
 217  *
 218  * We want to be able to reallocate the payload for our virtual GRFs, notably
 219  * because the setup coefficients for a full set of 16 FS inputs takes up 8 of
 220  * our 128 registers.
 221  *
 222  * The layout of the payload registers is:
 223  *
 224  * 0..nr_payload_regs-1: fixed function setup (including bary coordinates).
 225  * nr_payload_regs..nr_payload_regs+curb_read_lengh-1: uniform data
 226  * nr_payload_regs+curb_read_lengh..first_non_payload_grf-1: setup coefficients.
 227  *
 228  * And we have payload_node_count nodes covering these registers in order
 229  * (note that in SIMD16, a node is two registers).
 230  */
 231 void
 232 fs_visitor::setup_payload_interference(struct ra_graph *g,
 233                                        int payload_node_count,
 234                                        int first_payload_node)
 235 {
 236    int reg_width = dispatch_width / 8;
 237    int loop_depth = 0;
 238    int loop_end_ip = 0;
 239
 240    int payload_last_use_ip[payload_node_count];
 241    memset(payload_last_use_ip, 0, sizeof(payload_last_use_ip));
 242    int ip = 0;
 243    foreach_list(node, &this->instructions) {
 244       fs_inst *inst = (fs_inst *)node;
 245
 246       switch (inst->opcode) {
 247       case BRW_OPCODE_DO:
 248          loop_depth++;
 249
 250          /* Since payload regs are deffed only at the start of the shader
 251           * execution, any uses of the payload within a loop mean the live
 252           * interval extends to the end of the outermost loop.  Find the ip of
 253           * the end now.
 254           */
 255          if (loop_depth == 1)
 256             loop_end_ip = ip + count_to_loop_end(inst);
 257          break;
 258       case BRW_OPCODE_WHILE:
 259          loop_depth--;
 260          break;
 261       default:
 262          break;
 263       }
 264
 265       int use_ip;
 266       if (loop_depth > 0)
 267          use_ip = loop_end_ip;
 268       else
 269          use_ip = ip;
 270
 271       /* Note that UNIFORM args have been turned into FIXED_HW_REG by
 272        * assign_curbe_setup(), and interpolation uses fixed hardware regs from
 273        * the start (see interp_reg()).
 274        */
 275       for (int i = 0; i < 3; i++) {
 276          if (inst->src[i].file == HW_REG &&
 277              inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
 278             int node_nr = inst->src[i].fixed_hw_reg.nr / reg_width;
 279             if (node_nr >= payload_node_count)
 280                continue;
 281
 282             payload_last_use_ip[node_nr] = use_ip;
 283          }
 284       }
 285
 286       /* Special case instructions which have extra implied registers used. */
 287       switch (inst->opcode) {
 288       case FS_OPCODE_FB_WRITE:
 289          /* We could omit this for the !inst->header_present case, except that
 290           * the simulator apparently incorrectly reads from g0/g1 instead of
 291           * sideband.  It also really freaks out driver developers to see g0
 292           * used in unusual places, so just always reserve it.
 293           */
 294          payload_last_use_ip[0 / reg_width] = use_ip;
 295          payload_last_use_ip[1 / reg_width] = use_ip;
 296          break;
 297
 298       case FS_OPCODE_LINTERP:
 299          /* On gen6+ in SIMD16, there are 4 adjacent registers (so 2 nodes)
 300           * used by PLN's sourcing of the deltas, while we list only the first
 301           * two in the arguments (1 node).  Pre-gen6, the deltas are computed
 302           * in normal VGRFs.
 303           */
 304          if (brw->gen >= 6) {
 305             int delta_x_arg = 0;
 306             if (inst->src[delta_x_arg].file == HW_REG &&
 307                 inst->src[delta_x_arg].fixed_hw_reg.file ==
 308                 BRW_GENERAL_REGISTER_FILE) {
 309                int sechalf_node = (inst->src[delta_x_arg].fixed_hw_reg.nr /
 310                                    reg_width) + 1;
 311                assert(sechalf_node < payload_node_count);
 312                payload_last_use_ip[sechalf_node] = use_ip;
 313             }
 314          }
 315          break;
 316
 317       default:
 318          break;
 319       }
 320
 321       ip++;
 322    }
 323
 324    for (int i = 0; i < payload_node_count; i++) {
 325       /* Mark the payload node as interfering with any virtual grf that is
 326        * live between the start of the program and our last use of the payload
 327        * node.
 328        */
 329       for (int j = 0; j < this->virtual_grf_count; j++) {
 330          /* Note that we use a <= comparison, unlike virtual_grf_interferes(),
 331           * in order to not have to worry about the uniform issue described in
 332           * calculate_live_intervals().
 333           */
 334          if (this->virtual_grf_start[j] <= payload_last_use_ip[i]) {
 335             ra_add_node_interference(g, first_payload_node + i, j);
 336          }
 337       }
 338    }
 339
 340    for (int i = 0; i < payload_node_count; i++) {
 341       /* Mark each payload node as being allocated to its physical register.
 342        *
 343        * The alternative would be to have per-physical-register classes, which
 344        * would just be silly.
 345        */
 346       ra_set_node_reg(g, first_payload_node + i, i);
 347    }
 348 }
 349
 350 /**
 351  * Sets the mrf_used array to indicate which MRFs are used by the shader IR
 352  *
 353  * This is used in assign_regs() to decide which of the GRFs that we use as
 354  * MRFs on gen7 get normally register allocated, and in register spilling to
 355  * see if we can actually use MRFs to do spills without overwriting normal MRF
 356  * contents.
 357  */
 358 void
 359 fs_visitor::get_used_mrfs(bool *mrf_used)
 360 {
 361    int reg_width = dispatch_width / 8;
 362
 363    memset(mrf_used, 0, BRW_MAX_MRF * sizeof(bool));
 364
 365    foreach_list(node, &this->instructions) {
 366       fs_inst *inst = (fs_inst *)node;
 367
 368       if (inst->dst.file == MRF) {
 369          int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
 370          mrf_used[reg] = true;
 371          if (reg_width == 2) {
 372             if (inst->dst.reg & BRW_MRF_COMPR4) {
 373                mrf_used[reg + 4] = true;
 374             } else {
 375                mrf_used[reg + 1] = true;
 376             }
 377          }
 378       }
 379
 380       if (inst->mlen > 0) {
 381          for (int i = 0; i < implied_mrf_writes(inst); i++) {
 382             mrf_used[inst->base_mrf + i] = true;
 383          }
 384       }
 385    }
 386 }
 387
 388 /**
 389  * Sets interference between virtual GRFs and usage of the high GRFs for SEND
 390  * messages (treated as MRFs in code generation).
 391  */
 392 void
 393 fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
 394 {
 395    int reg_width = dispatch_width / 8;
 396
 397    bool mrf_used[BRW_MAX_MRF];
 398    get_used_mrfs(mrf_used);
 399
 400    for (int i = 0; i < BRW_MAX_MRF; i++) {
 401       /* Mark each MRF reg node as being allocated to its physical register.
 402        *
 403        * The alternative would be to have per-physical-register classes, which
 404        * would just be silly.
 405        */
 406       ra_set_node_reg(g, first_mrf_node + i,
 407                       (GEN7_MRF_HACK_START + i) / reg_width);
 408
 409       /* Since we don't have any live/dead analysis on the MRFs, just mark all
 410        * that are used as conflicting with all virtual GRFs.
 411        */
 412       if (mrf_used[i]) {
 413          for (int j = 0; j < this->virtual_grf_count; j++) {
 414             ra_add_node_interference(g, first_mrf_node + i, j);
 415          }
 416       }
 417    }
 418 }
 419
 420 bool
 421 fs_visitor::assign_regs(bool allow_spilling)
 422 {
 423    /* Most of this allocation was written for a reg_width of 1
 424     * (dispatch_width == 8).  In extending to SIMD16, the code was
 425     * left in place and it was converted to have the hardware
 426     * registers it's allocating be contiguous physical pairs of regs
 427     * for reg_width == 2.
 428     */
 429    int reg_width = dispatch_width / 8;
 430    int hw_reg_mapping[this->virtual_grf_count];
 431    int payload_node_count = (ALIGN(this->first_non_payload_grf, reg_width) /
 432                             reg_width);
 433    int rsi = reg_width - 1; /* Which brw->wm.reg_sets[] to use */
 434    calculate_live_intervals();
 435
 436    int node_count = this->virtual_grf_count;
 437    int first_payload_node = node_count;
 438    node_count += payload_node_count;
 439    int first_mrf_hack_node = node_count;
 440    if (brw->gen >= 7)
 441       node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START;
 442    struct ra_graph *g = ra_alloc_interference_graph(brw->wm.reg_sets[rsi].regs,
 443                                                     node_count);
 444
 445    for (int i = 0; i < this->virtual_grf_count; i++) {
 446       unsigned size = this->virtual_grf_sizes[i];
 447       int c;
 448
 449       assert(size <= ARRAY_SIZE(brw->wm.reg_sets[rsi].classes) &&
 450              "Register allocation relies on split_virtual_grfs()");
 451       c = brw->wm.reg_sets[rsi].classes[size - 1];
 452
 453       /* Special case: on pre-GEN6 hardware that supports PLN, the
 454        * second operand of a PLN instruction needs to be an
 455        * even-numbered register, so we have a special register class
 456        * wm_aligned_pairs_class to handle this case.  pre-GEN6 always
 457        * uses this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the
 458        * second operand of a PLN instruction (since it doesn't support
 459        * any other interpolation modes).  So all we need to do is find
 460        * that register and set it to the appropriate class.
 461        */
 462       if (brw->wm.reg_sets[rsi].aligned_pairs_class >= 0 &&
 463           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) {
 464          c = brw->wm.reg_sets[rsi].aligned_pairs_class;
 465       }
 466
 467       ra_set_node_class(g, i, c);
 468
 469       for (int j = 0; j < i; j++) {
 470          if (virtual_grf_interferes(i, j)) {
 471             ra_add_node_interference(g, i, j);
 472          }
 473       }
 474    }
 475
 476    setup_payload_interference(g, payload_node_count, first_payload_node);
 477    if (brw->gen >= 7)
 478       setup_mrf_hack_interference(g, first_mrf_hack_node);
 479
 480    /* Debug of register spilling: Go spill everything. */
 481    if (0) {
 482       int reg = choose_spill_reg(g);
 483
 484       if (reg != -1) {
 485          spill_reg(reg);
 486          ralloc_free(g);
 487          return false;
 488       }
 489    }
 490
 491    if (!ra_allocate_no_spills(g)) {
 492       /* Failed to allocate registers.  Spill a reg, and the caller will
 493        * loop back into here to try again.
 494        */
 495       int reg = choose_spill_reg(g);
 496
 497       if (reg == -1) {
 498          fail("no register to spill:\n");
 499          dump_instructions();
 500       } else if (allow_spilling) {
 501          spill_reg(reg);
 502       }
 503
 504       ralloc_free(g);
 505
 506       return false;
 507    }
 508
 509    /* Get the chosen virtual registers for each node, and map virtual
 510     * regs in the register classes back down to real hardware reg
 511     * numbers.
 512     */
 513    this->grf_used = payload_node_count * reg_width;
 514    for (int i = 0; i < this->virtual_grf_count; i++) {
 515       int reg = ra_get_node_reg(g, i);
 516
 517       hw_reg_mapping[i] = brw->wm.reg_sets[rsi].ra_reg_to_grf[reg] * reg_width;
 518       this->grf_used = MAX2(this->grf_used,
 519                             hw_reg_mapping[i] + this->virtual_grf_sizes[i] *
 520                             reg_width);
 521    }
 522
 523    foreach_list(node, &this->instructions) {
 524       fs_inst *inst = (fs_inst *)node;
 525
 526       assign_reg(hw_reg_mapping, &inst->dst, reg_width);
 527       assign_reg(hw_reg_mapping, &inst->src[0], reg_width);
 528       assign_reg(hw_reg_mapping, &inst->src[1], reg_width);
 529       assign_reg(hw_reg_mapping, &inst->src[2], reg_width);
 530    }
 531
 532    ralloc_free(g);
 533
 534    return true;
 535 }
 536
 537 void
 538 fs_visitor::emit_unspill(fs_inst *inst, fs_reg dst, uint32_t spill_offset,
 539                          int count)
 540 {
 541    for (int i = 0; i < count; i++) {
 542       /* The gen7 descriptor-based offset is 12 bits of HWORD units. */
 543       bool gen7_read = brw->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE;
 544
 545       fs_inst *unspill_inst =
 546          new(mem_ctx) fs_inst(gen7_read ?
 547                               SHADER_OPCODE_GEN7_SCRATCH_READ :
 548                               SHADER_OPCODE_GEN4_SCRATCH_READ,
 549                               dst);
 550       unspill_inst->offset = spill_offset;
 551       unspill_inst->ir = inst->ir;
 552       unspill_inst->annotation = inst->annotation;
 553
 554       if (!gen7_read) {
 555          unspill_inst->base_mrf = 14;
 556          unspill_inst->mlen = 1; /* header contains offset */
 557       }
 558       inst->insert_before(unspill_inst);
 559
 560       dst.reg_offset++;
 561       spill_offset += dispatch_width * sizeof(float);
 562    }
 563 }
 564
 565 int
 566 fs_visitor::choose_spill_reg(struct ra_graph *g)
 567 {
 568    float loop_scale = 1.0;
 569    float spill_costs[this->virtual_grf_count];
 570    bool no_spill[this->virtual_grf_count];
 571
 572    for (int i = 0; i < this->virtual_grf_count; i++) {
 573       spill_costs[i] = 0.0;
 574       no_spill[i] = false;
 575    }
 576
 577    /* Calculate costs for spilling nodes.  Call it a cost of 1 per
 578     * spill/unspill we'll have to do, and guess that the insides of
 579     * loops run 10 times.
 580     */
 581    foreach_list(node, &this->instructions) {
 582       fs_inst *inst = (fs_inst *)node;
 583
 584       for (unsigned int i = 0; i < 3; i++) {
 585          if (inst->src[i].file == GRF) {
 586             spill_costs[inst->src[i].reg] += loop_scale;
 587
 588             /* Register spilling logic assumes full-width registers; smeared
 589              * registers have a width of 1 so if we try to spill them we'll
 590              * generate invalid assembly.  This shouldn't be a problem because
 591              * smeared registers are only used as short-term temporaries when
 592              * loading pull constants, so spilling them is unlikely to reduce
 593              * register pressure anyhow.
 594              */
 595             if (inst->src[i].smear >= 0) {
 596                no_spill[inst->src[i].reg] = true;
 597             }
 598          }
 599       }
 600
 601       if (inst->dst.file == GRF) {
 602          spill_costs[inst->dst.reg] += inst->regs_written * loop_scale;
 603
 604          if (inst->dst.smear >= 0) {
 605             no_spill[inst->dst.reg] = true;
 606          }
 607       }
 608
 609       switch (inst->opcode) {
 610
 611       case BRW_OPCODE_DO:
 612          loop_scale *= 10;
 613          break;
 614
 615       case BRW_OPCODE_WHILE:
 616          loop_scale /= 10;
 617          break;
 618
 619       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 620          if (inst->src[0].file == GRF)
 621             no_spill[inst->src[0].reg] = true;
 622          break;
 623
 624       case SHADER_OPCODE_GEN4_SCRATCH_READ:
 625       case SHADER_OPCODE_GEN7_SCRATCH_READ:
 626          if (inst->dst.file == GRF)
 627             no_spill[inst->dst.reg] = true;
 628          break;
 629
 630       default:
 631          break;
 632       }
 633    }
 634
 635    for (int i = 0; i < this->virtual_grf_count; i++) {
 636       if (!no_spill[i])
 637          ra_set_node_spill_cost(g, i, spill_costs[i]);
 638    }
 639
 640    return ra_get_best_spill_node(g);
 641 }
 642
 643 void
 644 fs_visitor::spill_reg(int spill_reg)
 645 {
 646    int reg_size = dispatch_width * sizeof(float);
 647    int size = virtual_grf_sizes[spill_reg];
 648    unsigned int spill_offset = c->last_scratch;
 649    assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
 650    int spill_base_mrf = dispatch_width > 8 ? 13 : 14;
 651
 652    /* Spills may use MRFs 13-15 in the SIMD16 case.  Our texturing is done
 653     * using up to 11 MRFs starting from either m1 or m2, and fb writes can use
 654     * up to m13 (gen6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or
 655     * m15 (gen4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst
 656     * depth), starting from m1.  In summary: We may not be able to spill in
 657     * SIMD16 mode, because we'd stomp the FB writes.
 658     */
 659    if (!spilled_any_registers) {
 660       bool mrf_used[BRW_MAX_MRF];
 661       get_used_mrfs(mrf_used);
 662
 663       for (int i = spill_base_mrf; i < BRW_MAX_MRF; i++) {
 664          if (mrf_used[i]) {
 665             fail("Register spilling not supported with m%d used", i);
 666           return;
 667          }
 668       }
 669
 670       spilled_any_registers = true;
 671    }
 672
 673    c->last_scratch += size * reg_size;
 674
 675    /* Generate spill/unspill instructions for the objects being
 676     * spilled.  Right now, we spill or unspill the whole thing to a
 677     * virtual grf of the same size.  For most instructions, though, we
 678     * could just spill/unspill the GRF being accessed.
 679     */
 680    foreach_list(node, &this->instructions) {
 681       fs_inst *inst = (fs_inst *)node;
 682
 683       for (unsigned int i = 0; i < 3; i++) {
 684          if (inst->src[i].file == GRF &&
 685              inst->src[i].reg == spill_reg) {
 686             int regs_read = inst->regs_read(this, i);
 687             int subset_spill_offset = (spill_offset +
 688                                        reg_size * inst->src[i].reg_offset);
 689             fs_reg unspill_dst(GRF, virtual_grf_alloc(regs_read));
 690
 691             inst->src[i].reg = unspill_dst.reg;
 692             inst->src[i].reg_offset = 0;
 693
 694             emit_unspill(inst, unspill_dst, subset_spill_offset, regs_read);
 695          }
 696       }
 697
 698       if (inst->dst.file == GRF &&
 699           inst->dst.reg == spill_reg) {
 700          int subset_spill_offset = (spill_offset +
 701                                     reg_size * inst->dst.reg_offset);
 702          fs_reg spill_src(GRF, virtual_grf_alloc(inst->regs_written));
 703
 704          inst->dst.reg = spill_src.reg;
 705          inst->dst.reg_offset = 0;
 706
 707          /* If our write is going to affect just part of the
 708           * inst->regs_written(), then we need to unspill the destination
 709           * since we write back out all of the regs_written().
 710           */
 711          if (inst->predicate || inst->force_uncompressed ||
 712              inst->force_sechalf || inst->dst.subreg_offset) {
 713             emit_unspill(inst, spill_src, subset_spill_offset,
 714                          inst->regs_written);
 715          }
 716
 717          for (int chan = 0; chan < inst->regs_written; chan++) {
 718             fs_inst *spill_inst =
 719                new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
 720                                     reg_null_f, spill_src);
 721             spill_src.reg_offset++;
 722             spill_inst->offset = subset_spill_offset + chan * reg_size;
 723             spill_inst->ir = inst->ir;
 724             spill_inst->annotation = inst->annotation;
 725             spill_inst->mlen = 1 + dispatch_width / 8; /* header, value */
 726             spill_inst->base_mrf = spill_base_mrf;
 727             inst->insert_after(spill_inst);
 728          }
 729       }
 730    }
 731
 732    invalidate_live_intervals();
 733 }