i965/fs: Set up gen7 UBO loads as sends from GRFs.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
135 #define ALU1(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
140 }
141
142 #define ALU2(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
147 }
148
149 ALU1(NOT)
150 ALU1(MOV)
151 ALU1(FRC)
152 ALU1(RNDD)
153 ALU1(RNDE)
154 ALU1(RNDZ)
155 ALU2(ADD)
156 ALU2(MUL)
157 ALU2(MACH)
158 ALU2(AND)
159 ALU2(OR)
160 ALU2(XOR)
161 ALU2(SHL)
162 ALU2(SHR)
163 ALU2(ASR)
164
165 /** Gen4 predicated IF. */
166 fs_inst *
167 fs_visitor::IF(uint32_t predicate)
168 {
169 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
170 inst->predicate = predicate;
171 return inst;
172 }
173
174 /** Gen6+ IF with embedded comparison. */
175 fs_inst *
176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
177 {
178 assert(intel->gen >= 6);
179 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
180 reg_null_d, src0, src1);
181 inst->conditional_mod = condition;
182 return inst;
183 }
184
185 /**
186 * CMP: Sets the low bit of the destination channels with the result
187 * of the comparison, while the upper bits are undefined, and updates
188 * the flag register with the packed 16 bits of the result.
189 */
190 fs_inst *
191 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
192 {
193 fs_inst *inst;
194
195 /* Take the instruction:
196 *
197 * CMP null<d> src0<f> src1<f>
198 *
199 * Original gen4 does type conversion to the destination type before
200 * comparison, producing garbage results for floating point comparisons.
201 * gen5 does the comparison on the execution type (resolved source types),
202 * so dst type doesn't matter. gen6 does comparison and then uses the
203 * result as if it was the dst type with no conversion, which happens to
204 * mostly work out for float-interpreted-as-int since our comparisons are
205 * for >0, =0, <0.
206 */
207 if (intel->gen == 4) {
208 dst.type = src0.type;
209 if (dst.file == FIXED_HW_REG)
210 dst.fixed_hw_reg.type = dst.type;
211 }
212
213 resolve_ud_negate(&src0);
214 resolve_ud_negate(&src1);
215
216 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
217 inst->conditional_mod = condition;
218
219 return inst;
220 }
221
222 exec_list
223 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
224 fs_reg offset)
225 {
226 exec_list instructions;
227 fs_inst *inst;
228
229 if (intel->gen >= 7) {
230 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
231 dst, surf_index, offset);
232 instructions.push_tail(inst);
233 } else {
234 int base_mrf = 13;
235 bool header_present = true;
236
237 fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
238 mrf.type = BRW_REGISTER_TYPE_D;
239
240 /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
241 * dword-aligned byte offset.
242 */
243 if (intel->gen == 6) {
244 instructions.push_tail(MOV(mrf, offset));
245 } else {
246 instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
247 }
249 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
250 dst, surf_index);
251 inst->header_present = header_present;
252 inst->base_mrf = base_mrf;
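      /* Message length: the header, plus one register of offset data per
       * SIMD8 half of the dispatch.
       */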
253 inst->mlen = header_present + dispatch_width / 8;
254
255 instructions.push_tail(inst);
256 }
257
258 return instructions;
259 }
260
261 bool
262 fs_inst::equals(fs_inst *inst)
263 {
264 return (opcode == inst->opcode &&
265 dst.equals(inst->dst) &&
266 src[0].equals(inst->src[0]) &&
267 src[1].equals(inst->src[1]) &&
268 src[2].equals(inst->src[2]) &&
269 saturate == inst->saturate &&
270 predicate == inst->predicate &&
271 conditional_mod == inst->conditional_mod &&
272 mlen == inst->mlen &&
273 base_mrf == inst->base_mrf &&
274 sampler == inst->sampler &&
275 target == inst->target &&
276 eot == inst->eot &&
277 header_present == inst->header_present &&
278 shadow_compare == inst->shadow_compare &&
279 offset == inst->offset);
280 }
281
282 int
283 fs_inst::regs_written()
284 {
285 if (is_tex())
286 return 4;
287
288 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
289 * but we don't currently use them...nor do we have an opcode for them.
290 */
291
292 return 1;
293 }
294
295 bool
296 fs_inst::overwrites_reg(const fs_reg &reg)
297 {
298 return (reg.file == dst.file &&
299 reg.reg == dst.reg &&
300 reg.reg_offset >= dst.reg_offset &&
301 reg.reg_offset < dst.reg_offset + regs_written());
302 }
303
304 bool
305 fs_inst::is_tex()
306 {
307 return (opcode == SHADER_OPCODE_TEX ||
308 opcode == FS_OPCODE_TXB ||
309 opcode == SHADER_OPCODE_TXD ||
310 opcode == SHADER_OPCODE_TXF ||
311 opcode == SHADER_OPCODE_TXL ||
312 opcode == SHADER_OPCODE_TXS);
313 }
314
315 bool
316 fs_inst::is_math()
317 {
318 return (opcode == SHADER_OPCODE_RCP ||
319 opcode == SHADER_OPCODE_RSQ ||
320 opcode == SHADER_OPCODE_SQRT ||
321 opcode == SHADER_OPCODE_EXP2 ||
322 opcode == SHADER_OPCODE_LOG2 ||
323 opcode == SHADER_OPCODE_SIN ||
324 opcode == SHADER_OPCODE_COS ||
325 opcode == SHADER_OPCODE_INT_QUOTIENT ||
326 opcode == SHADER_OPCODE_INT_REMAINDER ||
327 opcode == SHADER_OPCODE_POW);
328 }
329
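/* These opcodes send their payload directly from GRFs rather than staging it
 * through MRFs first; can_do_source_mods() below relies on this to avoid
 * folding source modifiers into the payload registers.
 */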
330 bool
331 fs_inst::is_send_from_grf()
332 {
333 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
334 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
335 src[1].file == GRF));
336 }
337
338 bool
339 fs_visitor::can_do_source_mods(fs_inst *inst)
340 {
341 if (intel->gen == 6 && inst->is_math())
342 return false;
343
344 if (inst->is_send_from_grf())
345 return false;
346
347 return true;
348 }
349
350 void
351 fs_reg::init()
352 {
353 memset(this, 0, sizeof(*this));
354 this->smear = -1;
355 }
356
357 /** Generic unset register constructor. */
358 fs_reg::fs_reg()
359 {
360 init();
361 this->file = BAD_FILE;
362 }
363
364 /** Immediate value constructor. */
365 fs_reg::fs_reg(float f)
366 {
367 init();
368 this->file = IMM;
369 this->type = BRW_REGISTER_TYPE_F;
370 this->imm.f = f;
371 }
372
373 /** Immediate value constructor. */
374 fs_reg::fs_reg(int32_t i)
375 {
376 init();
377 this->file = IMM;
378 this->type = BRW_REGISTER_TYPE_D;
379 this->imm.i = i;
380 }
381
382 /** Immediate value constructor. */
383 fs_reg::fs_reg(uint32_t u)
384 {
385 init();
386 this->file = IMM;
387 this->type = BRW_REGISTER_TYPE_UD;
388 this->imm.u = u;
389 }
390
391 /** Fixed brw_reg Immediate value constructor. */
392 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
393 {
394 init();
395 this->file = FIXED_HW_REG;
396 this->fixed_hw_reg = fixed_hw_reg;
397 this->type = fixed_hw_reg.type;
398 }
399
400 bool
401 fs_reg::equals(const fs_reg &r) const
402 {
403 return (file == r.file &&
404 reg == r.reg &&
405 reg_offset == r.reg_offset &&
406 type == r.type &&
407 negate == r.negate &&
408 abs == r.abs &&
409 !reladdr && !r.reladdr &&
410 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
411 sizeof(fixed_hw_reg)) == 0 &&
412 smear == r.smear &&
413 imm.u == r.imm.u);
414 }
415
416 bool
417 fs_reg::is_zero() const
418 {
419 if (file != IMM)
420 return false;
421
422 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
423 }
424
425 bool
426 fs_reg::is_one() const
427 {
428 if (file != IMM)
429 return false;
430
431 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
432 }
433
434 int
435 fs_visitor::type_size(const struct glsl_type *type)
436 {
437 unsigned int size, i;
438
439 switch (type->base_type) {
440 case GLSL_TYPE_UINT:
441 case GLSL_TYPE_INT:
442 case GLSL_TYPE_FLOAT:
443 case GLSL_TYPE_BOOL:
444 return type->components();
445 case GLSL_TYPE_ARRAY:
446 return type_size(type->fields.array) * type->length;
447 case GLSL_TYPE_STRUCT:
448 size = 0;
449 for (i = 0; i < type->length; i++) {
450 size += type_size(type->fields.structure[i].type);
451 }
452 return size;
453 case GLSL_TYPE_SAMPLER:
454 /* Samplers take up no register space, since they're baked in at
455 * link time.
456 */
457 return 0;
458 default:
459 assert(!"not reached");
460 return 0;
461 }
462 }
463
464 fs_reg
465 fs_visitor::get_timestamp()
466 {
467 assert(intel->gen >= 7);
468
469 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
470 BRW_ARF_TIMESTAMP,
471 0),
472 BRW_REGISTER_TYPE_UD));
473
474 fs_reg dst = fs_reg(this, glsl_type::uint_type);
475
476 fs_inst *mov = emit(MOV(dst, ts));
477 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
478 * even if it's not enabled in the dispatch.
479 */
480 mov->force_writemask_all = true;
481 mov->force_uncompressed = true;
482
483 /* The caller wants the low 32 bits of the timestamp. Since it's running
484     * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
485 * which is plenty of time for our purposes. It is identical across the
486 * EUs, but since it's tracking GPU core speed it will increment at a
487 * varying rate as render P-states change.
488 *
489 * The caller could also check if render P-states have changed (or anything
490 * else that might disrupt timing) by setting smear to 2 and checking if
491 * that field is != 0.
492 */
493 dst.smear = 0;
494
495 return dst;
496 }
497
498 void
499 fs_visitor::emit_shader_time_begin()
500 {
501 current_annotation = "shader time start";
502 shader_start_time = get_timestamp();
503 }
504
505 void
506 fs_visitor::emit_shader_time_end()
507 {
508 current_annotation = "shader time end";
509
510 enum shader_time_shader_type type, written_type, reset_type;
511 if (dispatch_width == 8) {
512 type = ST_FS8;
513 written_type = ST_FS8_WRITTEN;
514 reset_type = ST_FS8_RESET;
515 } else {
516 assert(dispatch_width == 16);
517 type = ST_FS16;
518 written_type = ST_FS16_WRITTEN;
519 reset_type = ST_FS16_RESET;
520 }
521
522 fs_reg shader_end_time = get_timestamp();
523
524 /* Check that there weren't any timestamp reset events (assuming these
525 * were the only two timestamp reads that happened).
526 */
527 fs_reg reset = shader_end_time;
528 reset.smear = 2;
529 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
530 test->conditional_mod = BRW_CONDITIONAL_Z;
531 emit(IF(BRW_PREDICATE_NORMAL));
532
533 push_force_uncompressed();
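   /* Compute (end - start) as end + (-start); the EU has no dedicated
    * subtract instruction, so we negate the source instead.
    */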
534 fs_reg start = shader_start_time;
535 start.negate = true;
536 fs_reg diff = fs_reg(this, glsl_type::uint_type);
537 emit(ADD(diff, start, shader_end_time));
538
539 /* If there were no instructions between the two timestamp gets, the diff
540 * is 2 cycles. Remove that overhead, so I can forget about that when
541 * trying to determine the time taken for single instructions.
542 */
543 emit(ADD(diff, diff, fs_reg(-2u)));
544
545 emit_shader_time_write(type, diff);
546 emit_shader_time_write(written_type, fs_reg(1u));
547 emit(BRW_OPCODE_ELSE);
548 emit_shader_time_write(reset_type, fs_reg(1u));
549 emit(BRW_OPCODE_ENDIF);
550
551 pop_force_uncompressed();
552 }
553
554 void
555 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
556 fs_reg value)
557 {
558 /* Choose an index in the buffer and set up tracking information for our
559 * printouts.
560 */
561 int shader_time_index = brw->shader_time.num_entries++;
562    assert(shader_time_index < brw->shader_time.max_entries);
563 brw->shader_time.types[shader_time_index] = type;
564 if (prog) {
565 _mesa_reference_shader_program(ctx,
566 &brw->shader_time.programs[shader_time_index],
567 prog);
568 }
569
570 int base_mrf = 6;
571
572 fs_reg offset_mrf = fs_reg(MRF, base_mrf);
573 offset_mrf.type = BRW_REGISTER_TYPE_UD;
574 emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));
575
576 fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
577 time_mrf.type = BRW_REGISTER_TYPE_UD;
578 emit(MOV(time_mrf, value));
579
580 fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
581 inst->base_mrf = base_mrf;
582 inst->mlen = 2;
583 }
584
585 void
586 fs_visitor::fail(const char *format, ...)
587 {
588 va_list va;
589 char *msg;
590
591 if (failed)
592 return;
593
594 failed = true;
595
596 va_start(va, format);
597 msg = ralloc_vasprintf(mem_ctx, format, va);
598 va_end(va);
599 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
600
601 this->fail_msg = msg;
602
603 if (INTEL_DEBUG & DEBUG_WM) {
604 fprintf(stderr, "%s", msg);
605 }
606 }
607
608 fs_inst *
609 fs_visitor::emit(enum opcode opcode)
610 {
611 return emit(fs_inst(opcode));
612 }
613
614 fs_inst *
615 fs_visitor::emit(enum opcode opcode, fs_reg dst)
616 {
617 return emit(fs_inst(opcode, dst));
618 }
619
620 fs_inst *
621 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
622 {
623 return emit(fs_inst(opcode, dst, src0));
624 }
625
626 fs_inst *
627 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
628 {
629 return emit(fs_inst(opcode, dst, src0, src1));
630 }
631
632 fs_inst *
633 fs_visitor::emit(enum opcode opcode, fs_reg dst,
634 fs_reg src0, fs_reg src1, fs_reg src2)
635 {
636 return emit(fs_inst(opcode, dst, src0, src1, src2));
637 }
638
639 void
640 fs_visitor::push_force_uncompressed()
641 {
642 force_uncompressed_stack++;
643 }
644
645 void
646 fs_visitor::pop_force_uncompressed()
647 {
648 force_uncompressed_stack--;
649 assert(force_uncompressed_stack >= 0);
650 }
651
652 void
653 fs_visitor::push_force_sechalf()
654 {
655 force_sechalf_stack++;
656 }
657
658 void
659 fs_visitor::pop_force_sechalf()
660 {
661 force_sechalf_stack--;
662 assert(force_sechalf_stack >= 0);
663 }
664
665 /**
666 * Returns how many MRFs an FS opcode will write over.
667 *
668 * Note that this is not the 0 or 1 implied writes in an actual gen
669 * instruction -- the FS opcodes often generate MOVs in addition.
670 */
671 int
672 fs_visitor::implied_mrf_writes(fs_inst *inst)
673 {
674 if (inst->mlen == 0)
675 return 0;
676
677 switch (inst->opcode) {
678 case SHADER_OPCODE_RCP:
679 case SHADER_OPCODE_RSQ:
680 case SHADER_OPCODE_SQRT:
681 case SHADER_OPCODE_EXP2:
682 case SHADER_OPCODE_LOG2:
683 case SHADER_OPCODE_SIN:
684 case SHADER_OPCODE_COS:
685 return 1 * dispatch_width / 8;
686 case SHADER_OPCODE_POW:
687 case SHADER_OPCODE_INT_QUOTIENT:
688 case SHADER_OPCODE_INT_REMAINDER:
689 return 2 * dispatch_width / 8;
690 case SHADER_OPCODE_TEX:
691 case FS_OPCODE_TXB:
692 case SHADER_OPCODE_TXD:
693 case SHADER_OPCODE_TXF:
694 case SHADER_OPCODE_TXL:
695 case SHADER_OPCODE_TXS:
696 return 1;
697 case SHADER_OPCODE_SHADER_TIME_ADD:
698 return 0;
699 case FS_OPCODE_FB_WRITE:
700 return 2;
701 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
702 case FS_OPCODE_UNSPILL:
703 return 1;
704 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
705 return inst->header_present;
706 case FS_OPCODE_SPILL:
707 return 2;
708 default:
709 assert(!"not reached");
710 return inst->mlen;
711 }
712 }
713
714 int
715 fs_visitor::virtual_grf_alloc(int size)
716 {
717 if (virtual_grf_array_size <= virtual_grf_count) {
718 if (virtual_grf_array_size == 0)
719 virtual_grf_array_size = 16;
720 else
721 virtual_grf_array_size *= 2;
722 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
723 virtual_grf_array_size);
724 }
725 virtual_grf_sizes[virtual_grf_count] = size;
726 return virtual_grf_count++;
727 }
728
729 /** Fixed HW reg constructor. */
730 fs_reg::fs_reg(enum register_file file, int reg)
731 {
732 init();
733 this->file = file;
734 this->reg = reg;
735 this->type = BRW_REGISTER_TYPE_F;
736 }
737
738 /** Fixed HW reg constructor. */
739 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
740 {
741 init();
742 this->file = file;
743 this->reg = reg;
744 this->type = type;
745 }
746
747 /** Automatic reg constructor. */
748 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
749 {
750 init();
751
752 this->file = GRF;
753 this->reg = v->virtual_grf_alloc(v->type_size(type));
754 this->reg_offset = 0;
755 this->type = brw_type_for_base_type(type);
756 }
757
758 fs_reg *
759 fs_visitor::variable_storage(ir_variable *var)
760 {
761 return (fs_reg *)hash_table_find(this->variable_ht, var);
762 }
763
764 void
765 import_uniforms_callback(const void *key,
766 void *data,
767 void *closure)
768 {
769 struct hash_table *dst_ht = (struct hash_table *)closure;
770 const fs_reg *reg = (const fs_reg *)data;
771
772 if (reg->file != UNIFORM)
773 return;
774
775 hash_table_insert(dst_ht, data, key);
776 }
777
778 /* For 16-wide, we need to follow the uniform setup done by the 8-wide
779  * dispatch.  This brings in those uniform definitions.
780  */
781 void
782 fs_visitor::import_uniforms(fs_visitor *v)
783 {
784 hash_table_call_foreach(v->variable_ht,
785 import_uniforms_callback,
786 variable_ht);
787 this->params_remap = v->params_remap;
788 }
789
790 /* Our support for uniforms is piggy-backed on the struct
791 * gl_fragment_program, because that's where the values actually
792 * get stored, rather than in some global gl_shader_program uniform
793 * store.
794 */
795 int
796 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
797 {
798 unsigned int offset = 0;
799
800 if (type->is_matrix()) {
801 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
802 type->vector_elements,
803 1);
804
805 for (unsigned int i = 0; i < type->matrix_columns; i++) {
806 offset += setup_uniform_values(loc + offset, column);
807 }
808
809 return offset;
810 }
811
812 switch (type->base_type) {
813 case GLSL_TYPE_FLOAT:
814 case GLSL_TYPE_UINT:
815 case GLSL_TYPE_INT:
816 case GLSL_TYPE_BOOL:
817 for (unsigned int i = 0; i < type->vector_elements; i++) {
818 unsigned int param = c->prog_data.nr_params++;
819
820 this->param_index[param] = loc;
821 this->param_offset[param] = i;
822 }
823 return 1;
824
825 case GLSL_TYPE_STRUCT:
826 for (unsigned int i = 0; i < type->length; i++) {
827 offset += setup_uniform_values(loc + offset,
828 type->fields.structure[i].type);
829 }
830 return offset;
831
832 case GLSL_TYPE_ARRAY:
833 for (unsigned int i = 0; i < type->length; i++) {
834 offset += setup_uniform_values(loc + offset, type->fields.array);
835 }
836 return offset;
837
838 case GLSL_TYPE_SAMPLER:
839 /* The sampler takes up a slot, but we don't use any values from it. */
840 return 1;
841
842 default:
843 assert(!"not reached");
844 return 0;
845 }
846 }
847
848
849 /* Our support for builtin uniforms is even scarier than non-builtin.
850 * It sits on top of the PROG_STATE_VAR parameters that are
851 * automatically updated from GL context state.
852 */
853 void
854 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
855 {
856 const ir_state_slot *const slots = ir->state_slots;
857 assert(ir->state_slots != NULL);
858
859 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
860 /* This state reference has already been setup by ir_to_mesa, but we'll
861 * get the same index back here.
862 */
863 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
864 (gl_state_index *)slots[i].tokens);
865
866 /* Add each of the unique swizzles of the element as a parameter.
867 * This'll end up matching the expected layout of the
868 * array/matrix/structure we're trying to fill in.
869 */
870 int last_swiz = -1;
871 for (unsigned int j = 0; j < 4; j++) {
872 int swiz = GET_SWZ(slots[i].swizzle, j);
873 if (swiz == last_swiz)
874 break;
875 last_swiz = swiz;
876
877 this->param_index[c->prog_data.nr_params] = index;
878 this->param_offset[c->prog_data.nr_params] = swiz;
879 c->prog_data.nr_params++;
880 }
881 }
882 }
883
884 fs_reg *
885 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
886 {
887 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
888 fs_reg wpos = *reg;
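   /* Window-system framebuffers have their origin at the upper left, while
    * FBOs have it at the lower left, so flip Y whenever the shader's
    * requested origin doesn't match how we're actually rendering.
    */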
889 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
890
891 /* gl_FragCoord.x */
892 if (ir->pixel_center_integer) {
893 emit(MOV(wpos, this->pixel_x));
894 } else {
895 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
896 }
897 wpos.reg_offset++;
898
899 /* gl_FragCoord.y */
900 if (!flip && ir->pixel_center_integer) {
901 emit(MOV(wpos, this->pixel_y));
902 } else {
903 fs_reg pixel_y = this->pixel_y;
904 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
905
906 if (flip) {
907 pixel_y.negate = true;
908 offset += c->key.drawable_height - 1.0;
909 }
910
911 emit(ADD(wpos, pixel_y, fs_reg(offset)));
912 }
913 wpos.reg_offset++;
914
915 /* gl_FragCoord.z */
916 if (intel->gen >= 6) {
917 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
918 } else {
919 emit(FS_OPCODE_LINTERP, wpos,
920 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
921 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
922 interp_reg(FRAG_ATTRIB_WPOS, 2));
923 }
924 wpos.reg_offset++;
925
926 /* gl_FragCoord.w: Already set up in emit_interpolation */
927 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
928
929 return reg;
930 }
931
932 fs_inst *
933 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
934 glsl_interp_qualifier interpolation_mode,
935 bool is_centroid)
936 {
937 brw_wm_barycentric_interp_mode barycoord_mode;
938 if (is_centroid) {
939 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
940 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
941 else
942 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
943 } else {
944 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
945 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
946 else
947 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
948 }
949 return emit(FS_OPCODE_LINTERP, attr,
950 this->delta_x[barycoord_mode],
951 this->delta_y[barycoord_mode], interp);
952 }
953
954 fs_reg *
955 fs_visitor::emit_general_interpolation(ir_variable *ir)
956 {
957 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
958 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
959 fs_reg attr = *reg;
960
961 unsigned int array_elements;
962 const glsl_type *type;
963
964 if (ir->type->is_array()) {
965 array_elements = ir->type->length;
966 if (array_elements == 0) {
967 fail("dereferenced array '%s' has length 0\n", ir->name);
968 }
969 type = ir->type->fields.array;
970 } else {
971 array_elements = 1;
972 type = ir->type;
973 }
974
975 glsl_interp_qualifier interpolation_mode =
976 ir->determine_interpolation_mode(c->key.flat_shade);
977
978 int location = ir->location;
979 for (unsigned int i = 0; i < array_elements; i++) {
980 for (unsigned int j = 0; j < type->matrix_columns; j++) {
981 if (urb_setup[location] == -1) {
982 /* If there's no incoming setup data for this slot, don't
983 * emit interpolation for it.
984 */
985 attr.reg_offset += type->vector_elements;
986 location++;
987 continue;
988 }
989
990 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
991 /* Constant interpolation (flat shading) case. The SF has
992 * handed us defined values in only the constant offset
993 * field of the setup reg.
994 */
995 for (unsigned int k = 0; k < type->vector_elements; k++) {
996 struct brw_reg interp = interp_reg(location, k);
997 interp = suboffset(interp, 3);
998 interp.type = reg->type;
999 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1000 attr.reg_offset++;
1001 }
1002 } else {
1003 /* Smooth/noperspective interpolation case. */
1004 for (unsigned int k = 0; k < type->vector_elements; k++) {
1005 /* FINISHME: At some point we probably want to push
1006 * this farther by giving similar treatment to the
1007 * other potentially constant components of the
1008 * attribute, as well as making brw_vs_constval.c
1009 * handle varyings other than gl_TexCoord.
1010 */
1011 if (location >= FRAG_ATTRIB_TEX0 &&
1012 location <= FRAG_ATTRIB_TEX7 &&
1013 k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
1014 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
1015 } else {
1016 struct brw_reg interp = interp_reg(location, k);
1017 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1018 ir->centroid);
1019 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1020 /* Get the pixel/sample mask into f0 so that we know
1021 * which pixels are lit. Then, for each channel that is
1022 * unlit, replace the centroid data with non-centroid
1023 * data.
1024 */
1025 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
1026 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1027 interpolation_mode, false);
1028 inst->predicate = BRW_PREDICATE_NORMAL;
1029 inst->predicate_inverse = true;
1030 }
1031 if (intel->gen < 6) {
1032 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1033 }
1034 }
1035 attr.reg_offset++;
1036 }
1037
1038 }
1039 location++;
1040 }
1041 }
1042
1043 return reg;
1044 }
1045
1046 fs_reg *
1047 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1048 {
1049 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1050
1051 /* The frontfacing comes in as a bit in the thread payload. */
1052 if (intel->gen >= 6) {
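      /* Bit 15 of g0.0 carries the back-facing flag here: shift it down to
       * bit 0, invert, and mask so *reg holds 1 for front faces and 0 for
       * back faces.
       */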
1053 emit(BRW_OPCODE_ASR, *reg,
1054 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1055 fs_reg(15));
1056 emit(BRW_OPCODE_NOT, *reg, *reg);
1057 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1058 } else {
1059 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1060 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1061 * us front face
1062 */
1063 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1064 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1065 }
1066
1067 return reg;
1068 }
1069
1070 fs_inst *
1071 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1072 {
1073 switch (opcode) {
1074 case SHADER_OPCODE_RCP:
1075 case SHADER_OPCODE_RSQ:
1076 case SHADER_OPCODE_SQRT:
1077 case SHADER_OPCODE_EXP2:
1078 case SHADER_OPCODE_LOG2:
1079 case SHADER_OPCODE_SIN:
1080 case SHADER_OPCODE_COS:
1081 break;
1082 default:
1083 assert(!"not reached: bad math opcode");
1084 return NULL;
1085 }
1086
1087 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1088 * might be able to do better by doing execsize = 1 math and then
1089 * expanding that result out, but we would need to be careful with
1090 * masking.
1091 *
1092 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1093 * instructions, so we also move to a temp to set those up.
1094 */
1095 if (intel->gen == 6 && (src.file == UNIFORM ||
1096 src.abs ||
1097 src.negate)) {
1098 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1099 emit(BRW_OPCODE_MOV, expanded, src);
1100 src = expanded;
1101 }
1102
1103 fs_inst *inst = emit(opcode, dst, src);
1104
1105 if (intel->gen < 6) {
1106 inst->base_mrf = 2;
1107 inst->mlen = dispatch_width / 8;
1108 }
1109
1110 return inst;
1111 }
1112
1113 fs_inst *
1114 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1115 {
1116 int base_mrf = 2;
1117 fs_inst *inst;
1118
1119 switch (opcode) {
1120 case SHADER_OPCODE_POW:
1121 case SHADER_OPCODE_INT_QUOTIENT:
1122 case SHADER_OPCODE_INT_REMAINDER:
1123 break;
1124 default:
1125 assert(!"not reached: unsupported binary math opcode.");
1126 return NULL;
1127 }
1128
1129 if (intel->gen >= 7) {
1130 inst = emit(opcode, dst, src0, src1);
1131 } else if (intel->gen == 6) {
1132 /* Can't do hstride == 0 args to gen6 math, so expand it out.
1133 *
1134 * The hardware ignores source modifiers (negate and abs) on math
1135 * instructions, so we also move to a temp to set those up.
1136 */
1137 if (src0.file == UNIFORM || src0.abs || src0.negate) {
1138 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1139 expanded.type = src0.type;
1140 emit(BRW_OPCODE_MOV, expanded, src0);
1141 src0 = expanded;
1142 }
1143
1144 if (src1.file == UNIFORM || src1.abs || src1.negate) {
1145 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1146 expanded.type = src1.type;
1147 emit(BRW_OPCODE_MOV, expanded, src1);
1148 src1 = expanded;
1149 }
1150
1151 inst = emit(opcode, dst, src0, src1);
1152 } else {
1153 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1154 * "Message Payload":
1155 *
1156 * "Operand0[7]. For the INT DIV functions, this operand is the
1157 * denominator."
1158 * ...
1159 * "Operand1[7]. For the INT DIV functions, this operand is the
1160 * numerator."
1161 */
1162 bool is_int_div = opcode != SHADER_OPCODE_POW;
1163 fs_reg &op0 = is_int_div ? src1 : src0;
1164 fs_reg &op1 = is_int_div ? src0 : src1;
1165
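      /* The second operand travels in the message payload at m(base_mrf + 1);
       * the first remains a normal instruction source.
       */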
1166 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1167 inst = emit(opcode, dst, op0, reg_null_f);
1168
1169 inst->base_mrf = base_mrf;
1170 inst->mlen = 2 * dispatch_width / 8;
1171 }
1172 return inst;
1173 }
1174
1175 /**
1176 * To be called after the last _mesa_add_state_reference() call, to
1177 * set up prog_data.param[] for assign_curb_setup() and
1178 * setup_pull_constants().
1179 */
1180 void
1181 fs_visitor::setup_paramvalues_refs()
1182 {
1183 if (dispatch_width != 8)
1184 return;
1185
1186 /* Set up the pointers to ParamValues now that that array is finalized. */
1187 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1188 c->prog_data.param[i] =
1189 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
1190 this->param_offset[i];
1191 }
1192 }
1193
1194 void
1195 fs_visitor::assign_curb_setup()
1196 {
1197 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1198 if (dispatch_width == 8) {
1199 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1200 } else {
1201 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1202 }
1203
1204 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1205 foreach_list(node, &this->instructions) {
1206 fs_inst *inst = (fs_inst *)node;
1207
1208 for (unsigned int i = 0; i < 3; i++) {
1209 if (inst->src[i].file == UNIFORM) {
1210 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1211 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1212 constant_nr / 8,
1213 constant_nr % 8);
1214
1215 inst->src[i].file = FIXED_HW_REG;
1216 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1217 }
1218 }
1219 }
1220 }
1221
1222 void
1223 fs_visitor::calculate_urb_setup()
1224 {
1225 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1226 urb_setup[i] = -1;
1227 }
1228
1229 int urb_next = 0;
1230 /* Figure out where each of the incoming setup attributes lands. */
1231 if (intel->gen >= 6) {
1232 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1233 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1234 urb_setup[i] = urb_next++;
1235 }
1236 }
1237 } else {
1238 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1239 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1240 /* Point size is packed into the header, not as a general attribute */
1241 if (i == VERT_RESULT_PSIZ)
1242 continue;
1243
1244 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1245 int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1246
1247 /* The back color slot is skipped when the front color is
1248 * also written to. In addition, some slots can be
1249 * written in the vertex shader and not read in the
1250 * fragment shader. So the register number must always be
1251 * incremented, mapped or not.
1252 */
1253 if (fp_index >= 0)
1254 urb_setup[fp_index] = urb_next;
1255 urb_next++;
1256 }
1257 }
1258
1259 /*
1260     * It's an FS-only attribute, and we did the interpolation for it in
1261     * the SF thread.  So count it here, too.
1262 *
1263 * See compile_sf_prog() for more info.
1264 */
1265 if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1266 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1267 }
1268
1269 /* Each attribute is 4 setup channels, each of which is half a reg. */
1270 c->prog_data.urb_read_length = urb_next * 2;
1271 }
1272
1273 void
1274 fs_visitor::assign_urb_setup()
1275 {
1276 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1277
1278 /* Offset all the urb_setup[] index by the actual position of the
1279 * setup regs, now that the location of the constants has been chosen.
1280 */
1281 foreach_list(node, &this->instructions) {
1282 fs_inst *inst = (fs_inst *)node;
1283
1284 if (inst->opcode == FS_OPCODE_LINTERP) {
1285 assert(inst->src[2].file == FIXED_HW_REG);
1286 inst->src[2].fixed_hw_reg.nr += urb_start;
1287 }
1288
1289 if (inst->opcode == FS_OPCODE_CINTERP) {
1290 assert(inst->src[0].file == FIXED_HW_REG);
1291 inst->src[0].fixed_hw_reg.nr += urb_start;
1292 }
1293 }
1294
1295 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1296 }
1297
1298 /**
1299 * Split large virtual GRFs into separate components if we can.
1300 *
1301 * This is mostly duplicated with what brw_fs_vector_splitting does,
1302 * but that's really conservative because it's afraid of doing
1303 * splitting that doesn't result in real progress after the rest of
1304 * the optimization phases, which would cause infinite looping in
1305 * optimization. We can do it once here, safely. This also has the
1306 * opportunity to split interpolated values, or maybe even uniforms,
1307 * which we don't have at the IR level.
1308 *
1309 * We want to split, because virtual GRFs are what we register
1310 * allocate and spill (due to contiguousness requirements for some
1311 * instructions), and they're what we naturally generate in the
1312 * codegen process, but most virtual GRFs don't actually need to be
1313 * contiguous sets of GRFs. If we split, we'll end up with reduced
1314 * live intervals and better dead code elimination and coalescing.
1315 */
1316 void
1317 fs_visitor::split_virtual_grfs()
1318 {
1319 int num_vars = this->virtual_grf_count;
1320 bool split_grf[num_vars];
1321 int new_virtual_grf[num_vars];
1322
1323 /* Try to split anything > 0 sized. */
1324 for (int i = 0; i < num_vars; i++) {
1325 if (this->virtual_grf_sizes[i] != 1)
1326 split_grf[i] = true;
1327 else
1328 split_grf[i] = false;
1329 }
1330
1331 if (brw->has_pln &&
1332 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1333 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1334 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1335 * Gen6, that was the only supported interpolation mode, and since Gen6,
1336 * delta_x and delta_y are in fixed hardware registers.
1337 */
1338 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1339 false;
1340 }
1341
1342 foreach_list(node, &this->instructions) {
1343 fs_inst *inst = (fs_inst *)node;
1344
1345 /* If there's a SEND message that requires contiguous destination
1346 * registers, no splitting is allowed.
1347 */
1348 if (inst->regs_written() > 1) {
1349 split_grf[inst->dst.reg] = false;
1350 }
1351 }
1352
1353 /* Allocate new space for split regs. Note that the virtual
1354 * numbers will be contiguous.
1355 */
1356 for (int i = 0; i < num_vars; i++) {
1357 if (split_grf[i]) {
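         /* The original register keeps reg_offset 0; new_virtual_grf[i]
          * covers reg_offset 1, and the loop below allocates the remaining
          * size - 2 registers so that offset k lands at
          * new_virtual_grf[i] + k - 1.
          */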
1358 new_virtual_grf[i] = virtual_grf_alloc(1);
1359 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1360 int reg = virtual_grf_alloc(1);
1361 assert(reg == new_virtual_grf[i] + j - 1);
1362 (void) reg;
1363 }
1364 this->virtual_grf_sizes[i] = 1;
1365 }
1366 }
1367
1368 foreach_list(node, &this->instructions) {
1369 fs_inst *inst = (fs_inst *)node;
1370
1371 if (inst->dst.file == GRF &&
1372 split_grf[inst->dst.reg] &&
1373 inst->dst.reg_offset != 0) {
1374 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1375 inst->dst.reg_offset - 1);
1376 inst->dst.reg_offset = 0;
1377 }
1378 for (int i = 0; i < 3; i++) {
1379 if (inst->src[i].file == GRF &&
1380 split_grf[inst->src[i].reg] &&
1381 inst->src[i].reg_offset != 0) {
1382 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1383 inst->src[i].reg_offset - 1);
1384 inst->src[i].reg_offset = 0;
1385 }
1386 }
1387 }
1388 this->live_intervals_valid = false;
1389 }
1390
1391 /**
1392 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1393 *
1394 * During code generation, we create tons of temporary variables, many of
1395 * which get immediately killed and are never used again. Yet, in later
1396 * optimization and analysis passes, such as compute_live_intervals, we need
1397 * to loop over all the virtual GRFs. Compacting them can save a lot of
1398 * overhead.
1399 */
1400 void
1401 fs_visitor::compact_virtual_grfs()
1402 {
1403 /* Mark which virtual GRFs are used, and count how many. */
1404 int remap_table[this->virtual_grf_count];
1405 memset(remap_table, -1, sizeof(remap_table));
1406
1407 foreach_list(node, &this->instructions) {
1408 const fs_inst *inst = (const fs_inst *) node;
1409
1410 if (inst->dst.file == GRF)
1411 remap_table[inst->dst.reg] = 0;
1412
1413 for (int i = 0; i < 3; i++) {
1414 if (inst->src[i].file == GRF)
1415 remap_table[inst->src[i].reg] = 0;
1416 }
1417 }
1418
1419 /* In addition to registers used in instructions, fs_visitor keeps
1420 * direct references to certain special values which must be patched:
1421 */
1422 fs_reg *special[] = {
1423 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1424 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1425 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1426 &delta_x[0], &delta_x[1], &delta_x[2],
1427 &delta_x[3], &delta_x[4], &delta_x[5],
1428 &delta_y[0], &delta_y[1], &delta_y[2],
1429 &delta_y[3], &delta_y[4], &delta_y[5],
1430 };
1431 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1432 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1433
1434 /* Treat all special values as used, to be conservative */
1435 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1436 if (special[i]->file == GRF)
1437 remap_table[special[i]->reg] = 0;
1438 }
1439
1440 /* Compact the GRF arrays. */
1441 int new_index = 0;
1442 for (int i = 0; i < this->virtual_grf_count; i++) {
1443 if (remap_table[i] != -1) {
1444 remap_table[i] = new_index;
1445 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1446 if (live_intervals_valid) {
1447 virtual_grf_use[new_index] = virtual_grf_use[i];
1448 virtual_grf_def[new_index] = virtual_grf_def[i];
1449 }
1450 ++new_index;
1451 }
1452 }
1453
1454 this->virtual_grf_count = new_index;
1455
1456 /* Patch all the instructions to use the newly renumbered registers */
1457 foreach_list(node, &this->instructions) {
1458 fs_inst *inst = (fs_inst *) node;
1459
1460 if (inst->dst.file == GRF)
1461 inst->dst.reg = remap_table[inst->dst.reg];
1462
1463 for (int i = 0; i < 3; i++) {
1464 if (inst->src[i].file == GRF)
1465 inst->src[i].reg = remap_table[inst->src[i].reg];
1466 }
1467 }
1468
1469 /* Patch all the references to special values */
1470 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1471 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1472 special[i]->reg = remap_table[special[i]->reg];
1473 }
1474 }
1475
1476 bool
1477 fs_visitor::remove_dead_constants()
1478 {
1479 if (dispatch_width == 8) {
1480 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1481
1482 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1483 this->params_remap[i] = -1;
1484
1485 /* Find which params are still in use. */
1486 foreach_list(node, &this->instructions) {
1487 fs_inst *inst = (fs_inst *)node;
1488
1489 for (int i = 0; i < 3; i++) {
1490 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1491
1492 if (inst->src[i].file != UNIFORM)
1493 continue;
1494
1495 assert(constant_nr < (int)c->prog_data.nr_params);
1496
1497 /* For now, set this to non-negative. We'll give it the
1498 * actual new number in a moment, in order to keep the
1499 * register numbers nicely ordered.
1500 */
1501 this->params_remap[constant_nr] = 0;
1502 }
1503 }
1504
1505 /* Figure out what the new numbers for the params will be. At some
1506 * point when we're doing uniform array access, we're going to want
1507 * to keep the distinction between .reg and .reg_offset, but for
1508 * now we don't care.
1509 */
1510 unsigned int new_nr_params = 0;
1511 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1512 if (this->params_remap[i] != -1) {
1513 this->params_remap[i] = new_nr_params++;
1514 }
1515 }
1516
1517 /* Update the list of params to be uploaded to match our new numbering. */
1518 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1519 int remapped = this->params_remap[i];
1520
1521 if (remapped == -1)
1522 continue;
1523
1524 /* We've already done setup_paramvalues_refs() so no need to worry
1525 * about param_index and param_offset.
1526 */
1527 c->prog_data.param[remapped] = c->prog_data.param[i];
1528 }
1529
1530 c->prog_data.nr_params = new_nr_params;
1531 } else {
1532 /* This should have been generated in the 8-wide pass already. */
1533 assert(this->params_remap);
1534 }
1535
1536 /* Now do the renumbering of the shader to remove unused params. */
1537 foreach_list(node, &this->instructions) {
1538 fs_inst *inst = (fs_inst *)node;
1539
1540 for (int i = 0; i < 3; i++) {
1541 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1542
1543 if (inst->src[i].file != UNIFORM)
1544 continue;
1545
1546 assert(this->params_remap[constant_nr] != -1);
1547 inst->src[i].reg = this->params_remap[constant_nr];
1548 inst->src[i].reg_offset = 0;
1549 }
1550 }
1551
1552 return true;
1553 }
1554
1555 /*
1556 * Implements array access of uniforms by inserting a
1557 * PULL_CONSTANT_LOAD instruction.
1558 *
1559 * Unlike temporary GRF array access (where we don't support it due to
1560 * the difficulty of doing relative addressing on instruction
1561 * destinations), we could potentially do array access of uniforms
1562 * that were loaded in GRF space as push constants. In real-world
1563 * usage we've seen, though, the arrays being used are always larger
1564 * than we could load as push constants, so just always move all
1565 * uniform array access out to a pull constant buffer.
1566 */
1567 void
1568 fs_visitor::move_uniform_array_access_to_pull_constants()
1569 {
1570 int pull_constant_loc[c->prog_data.nr_params];
1571
1572 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1573 pull_constant_loc[i] = -1;
1574 }
1575
1576 /* Walk through and find array access of uniforms. Put a copy of that
1577 * uniform in the pull constant buffer.
1578 *
1579 * Note that we don't move constant-indexed accesses to arrays. No
1580 * testing has been done of the performance impact of this choice.
1581 */
1582 foreach_list_safe(node, &this->instructions) {
1583 fs_inst *inst = (fs_inst *)node;
1584
1585       for (int i = 0; i < 3; i++) {
1586 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1587 continue;
1588
1589 int uniform = inst->src[i].reg;
1590
1591 /* If this array isn't already present in the pull constant buffer,
1592 * add it.
1593 */
1594 if (pull_constant_loc[uniform] == -1) {
1595 const float **values = &c->prog_data.param[uniform];
1596
1597 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1598
1599 assert(param_size[uniform]);
1600
1601 for (int j = 0; j < param_size[uniform]; j++) {
1602 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1603 values[j];
1604 }
1605 }
1606
1607 /* Set up the annotation tracking for new generated instructions. */
1608 base_ir = inst->ir;
1609 current_annotation = inst->annotation;
1610
1611 fs_reg offset = fs_reg(this, glsl_type::int_type);
1612 inst->insert_before(ADD(offset, *inst->src[i].reladdr,
1613 fs_reg(pull_constant_loc[uniform] +
1614 inst->src[i].reg_offset)));
1615
1616 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1617 fs_reg temp = fs_reg(this, glsl_type::float_type);
1618 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1619 surf_index, offset);
1620 inst->insert_before(&list);
1621
1622 inst->src[i].file = temp.file;
1623 inst->src[i].reg = temp.reg;
1624 inst->src[i].reg_offset = temp.reg_offset;
1625 inst->src[i].reladdr = NULL;
1626 }
1627 }
1628 }
1629
1630 /**
1631 * Choose accesses from the UNIFORM file to demote to using the pull
1632 * constant buffer.
1633 *
1634 * We allow a fragment shader to have more than the specified minimum
1635 * maximum number of fragment shader uniform components (64). If
1636 * there are too many of these, they'd fill up all of register space.
1637 * So, this will push some of them out to the pull constant buffer and
1638 * update the program to load them.
1639 */
1640 void
1641 fs_visitor::setup_pull_constants()
1642 {
1643 /* Only allow 16 registers (128 uniform components) as push constants. */
1644 unsigned int max_uniform_components = 16 * 8;
1645 if (c->prog_data.nr_params <= max_uniform_components)
1646 return;
1647
1648 if (dispatch_width == 16) {
1649 fail("Pull constants not supported in 16-wide\n");
1650 return;
1651 }
1652
1653 /* Just demote the end of the list. We could probably do better
1654 * here, demoting things that are rarely used in the program first.
1655 */
1656 unsigned int pull_uniform_base = max_uniform_components;
1657
1658 int pull_constant_loc[c->prog_data.nr_params];
1659 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1660 if (i < pull_uniform_base) {
1661 pull_constant_loc[i] = -1;
1662 } else {
1663 pull_constant_loc[i] = -1;
1664 /* If our constant is already being uploaded for reladdr purposes,
1665 * reuse it.
1666 */
1667 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1668 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1669 pull_constant_loc[i] = j;
1670 break;
1671 }
1672 }
1673 if (pull_constant_loc[i] == -1) {
1674 int pull_index = c->prog_data.nr_pull_params++;
1675 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1676             pull_constant_loc[i] = pull_index;
1677 }
1678 }
1679 }
1680 c->prog_data.nr_params = pull_uniform_base;
1681
1682 foreach_list(node, &this->instructions) {
1683 fs_inst *inst = (fs_inst *)node;
1684
1685 for (int i = 0; i < 3; i++) {
1686 if (inst->src[i].file != UNIFORM)
1687 continue;
1688
1689 int pull_index = pull_constant_loc[inst->src[i].reg +
1690 inst->src[i].reg_offset];
1691 if (pull_index == -1)
1692 continue;
1693
1694 assert(!inst->src[i].reladdr);
1695
1696 fs_reg dst = fs_reg(this, glsl_type::float_type);
1697 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1698 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1699 fs_inst *pull =
1700 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1701 dst, index, offset);
1702 pull->ir = inst->ir;
1703 pull->annotation = inst->annotation;
1704 pull->base_mrf = 14;
1705 pull->mlen = 1;
1706
1707 inst->insert_before(pull);
1708
1709 inst->src[i].file = GRF;
1710 inst->src[i].reg = dst.reg;
1711 inst->src[i].reg_offset = 0;
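      /* The pull load fetches an aligned vec4; smear selects the component
       * we actually wanted within it.
       */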
1712 inst->src[i].smear = pull_index & 3;
1713 }
1714 }
1715 }
1716
1717 bool
1718 fs_visitor::opt_algebraic()
1719 {
1720 bool progress = false;
1721
1722 foreach_list(node, &this->instructions) {
1723 fs_inst *inst = (fs_inst *)node;
1724
1725 switch (inst->opcode) {
1726 case BRW_OPCODE_MUL:
1727 if (inst->src[1].file != IMM)
1728 continue;
1729
1730 /* a * 1.0 = a */
1731 if (inst->src[1].is_one()) {
1732 inst->opcode = BRW_OPCODE_MOV;
1733 inst->src[1] = reg_undef;
1734 progress = true;
1735 break;
1736 }
1737
1738 /* a * 0.0 = 0.0 */
1739 if (inst->src[1].is_zero()) {
1740 inst->opcode = BRW_OPCODE_MOV;
1741 inst->src[0] = inst->src[1];
1742 inst->src[1] = reg_undef;
1743 progress = true;
1744 break;
1745 }
1746
1747 break;
1748 case BRW_OPCODE_ADD:
1749 if (inst->src[1].file != IMM)
1750 continue;
1751
1752 /* a + 0.0 = a */
1753 if (inst->src[1].is_zero()) {
1754 inst->opcode = BRW_OPCODE_MOV;
1755 inst->src[1] = reg_undef;
1756 progress = true;
1757 break;
1758 }
1759 break;
1760 default:
1761 break;
1762 }
1763 }
1764
1765 return progress;
1766 }
1767
1768 /**
1769  * Must be called after calculate_live_intervals() to remove unused
1770  * writes to registers -- register allocation will fail otherwise
1771  * because something defined but never used won't be considered to
1772  * interfere with other regs.
1773 */
1774 bool
1775 fs_visitor::dead_code_eliminate()
1776 {
1777 bool progress = false;
1778 int pc = 0;
1779
1780 calculate_live_intervals();
1781
1782 foreach_list_safe(node, &this->instructions) {
1783 fs_inst *inst = (fs_inst *)node;
1784
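      /* If the last use of this GRF is at or before this instruction, the
       * value written here can never be read.
       */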
1785 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1786 inst->remove();
1787 progress = true;
1788 }
1789
1790 pc++;
1791 }
1792
1793 if (progress)
1794 live_intervals_valid = false;
1795
1796 return progress;
1797 }
1798
1799 /**
1800 * Implements a second type of register coalescing: This one checks if
1801 * the two regs involved in a raw move don't interfere, in which case
1802  * they can both be stored in the same place and the MOV removed.
1803 */
1804 bool
1805 fs_visitor::register_coalesce_2()
1806 {
1807 bool progress = false;
1808
1809 calculate_live_intervals();
1810
1811 foreach_list_safe(node, &this->instructions) {
1812 fs_inst *inst = (fs_inst *)node;
1813
1814 if (inst->opcode != BRW_OPCODE_MOV ||
1815 inst->predicate ||
1816 inst->saturate ||
1817 inst->src[0].file != GRF ||
1818 inst->src[0].negate ||
1819 inst->src[0].abs ||
1820 inst->src[0].smear != -1 ||
1821 inst->dst.file != GRF ||
1822 inst->dst.type != inst->src[0].type ||
1823 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1824 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1825 continue;
1826 }
1827
1828 int reg_from = inst->src[0].reg;
1829 assert(inst->src[0].reg_offset == 0);
1830 int reg_to = inst->dst.reg;
1831 int reg_to_offset = inst->dst.reg_offset;
1832
1833 foreach_list_safe(node, &this->instructions) {
1834 fs_inst *scan_inst = (fs_inst *)node;
1835
1836 if (scan_inst->dst.file == GRF &&
1837 scan_inst->dst.reg == reg_from) {
1838 scan_inst->dst.reg = reg_to;
1839 scan_inst->dst.reg_offset = reg_to_offset;
1840 }
1841 for (int i = 0; i < 3; i++) {
1842 if (scan_inst->src[i].file == GRF &&
1843 scan_inst->src[i].reg == reg_from) {
1844 scan_inst->src[i].reg = reg_to;
1845 scan_inst->src[i].reg_offset = reg_to_offset;
1846 }
1847 }
1848 }
1849
1850 inst->remove();
1851 live_intervals_valid = false;
1852 progress = true;
1853 continue;
1854 }
1855
1856 return progress;
1857 }
1858
1859 bool
1860 fs_visitor::register_coalesce()
1861 {
1862 bool progress = false;
1863 int if_depth = 0;
1864 int loop_depth = 0;
1865
1866 foreach_list_safe(node, &this->instructions) {
1867 fs_inst *inst = (fs_inst *)node;
1868
1869 /* Make sure that we dominate the instructions we're going to
1870 * scan for interfering with our coalescing, or we won't have
1871 * scanned enough to see if anything interferes with our
1872 * coalescing. We don't dominate the following instructions if
1873 * we're in a loop or an if block.
1874 */
1875 switch (inst->opcode) {
1876 case BRW_OPCODE_DO:
1877 loop_depth++;
1878 break;
1879 case BRW_OPCODE_WHILE:
1880 loop_depth--;
1881 break;
1882 case BRW_OPCODE_IF:
1883 if_depth++;
1884 break;
1885 case BRW_OPCODE_ENDIF:
1886 if_depth--;
1887 break;
1888 default:
1889 break;
1890 }
1891 if (loop_depth || if_depth)
1892 continue;
1893
1894 if (inst->opcode != BRW_OPCODE_MOV ||
1895 inst->predicate ||
1896 inst->saturate ||
1897 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1898                                    inst->src[0].file != UNIFORM) ||
1899 inst->dst.type != inst->src[0].type)
1900 continue;
1901
1902 bool has_source_modifiers = (inst->src[0].abs ||
1903 inst->src[0].negate ||
1904 inst->src[0].file == UNIFORM);
1905
1906 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1907 * them: check for no writes to either one until the exit of the
1908 * program.
1909 */
1910 bool interfered = false;
1911
1912 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1913 !scan_inst->is_tail_sentinel();
1914 scan_inst = (fs_inst *)scan_inst->next) {
1915 if (scan_inst->dst.file == GRF) {
1916 if (scan_inst->overwrites_reg(inst->dst) ||
1917 scan_inst->overwrites_reg(inst->src[0])) {
1918 interfered = true;
1919 break;
1920 }
1921 }
1922
1923 /* The gen6 MATH instruction can't handle source modifiers or
1924 * unusual register regions, so avoid coalescing those for
1925 * now. We should do something more specific.
1926 */
1927 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1928 interfered = true;
1929 break;
1930 }
1931
1932 /* The accumulator result appears to get used for the
1933 * conditional modifier generation. When negating a UD
1934 * value, there is a 33rd bit generated for the sign in the
1935 * accumulator value, so now you can't check, for example,
1936 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1937 */
1938 if (scan_inst->conditional_mod &&
1939 inst->src[0].negate &&
1940 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1941 interfered = true;
1942 break;
1943 }
1944 }
1945 if (interfered) {
1946 continue;
1947 }
1948
1949 /* Rewrite the later usage to point at the source of the move to
1950 * be removed.
1951 */
1952 for (fs_inst *scan_inst = inst;
1953 !scan_inst->is_tail_sentinel();
1954 scan_inst = (fs_inst *)scan_inst->next) {
1955 for (int i = 0; i < 3; i++) {
1956 if (scan_inst->src[i].file == GRF &&
1957 scan_inst->src[i].reg == inst->dst.reg &&
1958 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1959 fs_reg new_src = inst->src[0];
1960 if (scan_inst->src[i].abs) {
1961 new_src.negate = 0;
1962 new_src.abs = 1;
1963 }
1964 new_src.negate ^= scan_inst->src[i].negate;
1965 scan_inst->src[i] = new_src;
1966 }
1967 }
1968 }
1969
1970 inst->remove();
1971 progress = true;
1972 }
1973
1974 if (progress)
1975 live_intervals_valid = false;
1976
1977 return progress;
1978 }
1979
1980
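/**
 * Eliminates a MOV from a GRF to an MRF by rewriting the instruction that
 * computed the GRF value to write into the MRF directly.
 *
 * A minimal sketch of the transformation (hypothetical IR):
 *
 *    add vgrf4, vgrf2, vgrf3
 *    mov m2, vgrf4
 *
 * becomes, when vgrf4 has no later readers:
 *
 *    add m2, vgrf2, vgrf3
 */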
1981 bool
1982 fs_visitor::compute_to_mrf()
1983 {
1984 bool progress = false;
1985 int next_ip = 0;
1986
1987 calculate_live_intervals();
1988
1989 foreach_list_safe(node, &this->instructions) {
1990 fs_inst *inst = (fs_inst *)node;
1991
1992 int ip = next_ip;
1993 next_ip++;
1994
1995 if (inst->opcode != BRW_OPCODE_MOV ||
1996 inst->predicate ||
1997 inst->dst.file != MRF || inst->src[0].file != GRF ||
1998 inst->dst.type != inst->src[0].type ||
1999 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2000 continue;
2001
2002 /* Work out which hardware MRF registers are written by this
2003 * instruction.
2004 */
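      /* Note that a COMPR4 write touches MRFs m and m+4, while an ordinary
       * compressed (16-wide) write touches m and m+1, so (mrf_low, mrf_high)
       * is a pair of registers rather than an inclusive range.
       */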
2005 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2006 int mrf_high;
2007 if (inst->dst.reg & BRW_MRF_COMPR4) {
2008 mrf_high = mrf_low + 4;
2009 } else if (dispatch_width == 16 &&
2010 (!inst->force_uncompressed && !inst->force_sechalf)) {
2011 mrf_high = mrf_low + 1;
2012 } else {
2013 mrf_high = mrf_low;
2014 }
2015
2016 /* Can't compute-to-MRF this GRF if someone else was going to
2017 * read it later.
2018 */
2019 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2020 continue;
2021
2022 /* Found a move of a GRF to a MRF. Let's see if we can go
2023 * rewrite the thing that made this GRF to write into the MRF.
2024 */
2025 fs_inst *scan_inst;
2026 for (scan_inst = (fs_inst *)inst->prev;
2027 scan_inst->prev != NULL;
2028 scan_inst = (fs_inst *)scan_inst->prev) {
2029 if (scan_inst->dst.file == GRF &&
2030 scan_inst->dst.reg == inst->src[0].reg) {
2031 /* Found the last thing to write our reg we want to turn
2032 * into a compute-to-MRF.
2033 */
2034
2035 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2036 if (scan_inst->mlen) {
2037 break;
2038 }
2039
2040 /* If it's predicated, it (probably) didn't populate all
2041 * the channels. We might be able to rewrite everything
2042 * that writes that reg, but it would require smarter
2043 * tracking to delay the rewriting until complete success.
2044 */
2045 if (scan_inst->predicate)
2046 break;
2047
2048 /* If it's half of register setup and not the same half as
2049 * our MOV we're trying to remove, bail for now.
2050 */
2051 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2052 scan_inst->force_sechalf != inst->force_sechalf) {
2053 break;
2054 }
2059
2060 if (intel->gen >= 6) {
2061 /* gen6 math instructions must have the destination be
2062 * GRF, so no compute-to-MRF for them.
2063 */
2064 if (scan_inst->is_math()) {
2065 break;
2066 }
2067 }
2068
2069 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2070 /* Found the creator of our MRF's source value. */
2071 scan_inst->dst.file = MRF;
2072 scan_inst->dst.reg = inst->dst.reg;
2073 scan_inst->saturate |= inst->saturate;
2074 inst->remove();
2075 progress = true;
2076 }
2077 break;
2078 }
2079
2080 /* We don't handle flow control here. Most computation of
2081 * values that end up in MRFs are shortly before the MRF
2082 * write anyway.
2083 */
2084 if (scan_inst->opcode == BRW_OPCODE_DO ||
2085 scan_inst->opcode == BRW_OPCODE_WHILE ||
2086 scan_inst->opcode == BRW_OPCODE_ELSE ||
2087 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2088 break;
2089 }
2090
2091 /* You can't read from an MRF, so if someone else reads our
2092 * MRF's source GRF that we wanted to rewrite, that stops us.
2093 */
2094 bool interfered = false;
2095 for (int i = 0; i < 3; i++) {
2096 if (scan_inst->src[i].file == GRF &&
2097 scan_inst->src[i].reg == inst->src[0].reg &&
2098 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2099 interfered = true;
2100 }
2101 }
2102 if (interfered)
2103 break;
2104
2105 if (scan_inst->dst.file == MRF) {
2106 /* If somebody else writes our MRF here, we can't
2107 * compute-to-MRF before that.
2108 */
2109 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2110 int scan_mrf_high;
2111
2112 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2113 scan_mrf_high = scan_mrf_low + 4;
2114 } else if (dispatch_width == 16 &&
2115 (!scan_inst->force_uncompressed &&
2116 !scan_inst->force_sechalf)) {
2117 scan_mrf_high = scan_mrf_low + 1;
2118 } else {
2119 scan_mrf_high = scan_mrf_low;
2120 }
2121
2122 if (mrf_low == scan_mrf_low ||
2123 mrf_low == scan_mrf_high ||
2124 mrf_high == scan_mrf_low ||
2125 mrf_high == scan_mrf_high) {
2126 break;
2127 }
2128 }
2129
2130 if (scan_inst->mlen > 0) {
2131 /* Found a SEND instruction, which means that there are
2132 * live values in MRFs from base_mrf to base_mrf +
2133 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2134 * above it.
2135 */
2136 if (mrf_low >= scan_inst->base_mrf &&
2137 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2138 break;
2139 }
2140 if (mrf_high >= scan_inst->base_mrf &&
2141 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2142 break;
2143 }
2144 }
2145 }
2146 }
2147
2148 if (progress)
2149 live_intervals_valid = false;
2150
2151 return progress;
2152 }
2153
2154 /**
2155 * Walks through basic blocks, looking for repeated MRF writes and
2156 * removing the later ones.
2157 */
2158 bool
2159 fs_visitor::remove_duplicate_mrf_writes()
2160 {
2161 fs_inst *last_mrf_move[16];
2162 bool progress = false;
2163
2164 /* Need to update the MRF tracking for compressed instructions. */
2165 if (dispatch_width == 16)
2166 return false;
2167
2168 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2169
2170 foreach_list_safe(node, &this->instructions) {
2171 fs_inst *inst = (fs_inst *)node;
2172
2173 switch (inst->opcode) {
2174 case BRW_OPCODE_DO:
2175 case BRW_OPCODE_WHILE:
2176 case BRW_OPCODE_IF:
2177 case BRW_OPCODE_ELSE:
2178 case BRW_OPCODE_ENDIF:
2179 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2180 continue;
2181 default:
2182 break;
2183 }
2184
2185 if (inst->opcode == BRW_OPCODE_MOV &&
2186 inst->dst.file == MRF) {
2187 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2188 if (prev_inst && inst->equals(prev_inst)) {
2189 inst->remove();
2190 progress = true;
2191 continue;
2192 }
2193 }
2194
2195 /* Clear out the last-write records for MRFs that were overwritten. */
2196 if (inst->dst.file == MRF) {
2197 last_mrf_move[inst->dst.reg] = NULL;
2198 }
2199
2200 if (inst->mlen > 0) {
2201       /* Found a SEND instruction; clear the records for the MRFs it
2202        * implicitly writes.  We could do better here.
2203        */
2204 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2205 last_mrf_move[inst->base_mrf + i] = NULL;
2206 }
2207 }
2208
2209 /* Clear out any MRF move records whose sources got overwritten. */
2210 if (inst->dst.file == GRF) {
2211 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2212 if (last_mrf_move[i] &&
2213 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2214 last_mrf_move[i] = NULL;
2215 }
2216 }
2217 }
2218
2219 if (inst->opcode == BRW_OPCODE_MOV &&
2220 inst->dst.file == MRF &&
2221 inst->src[0].file == GRF &&
2222 !inst->predicate) {
2223 last_mrf_move[inst->dst.reg] = inst;
2224 }
2225 }
2226
2227 if (progress)
2228 live_intervals_valid = false;
2229
2230 return progress;
2231 }
2232
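/**
 * Prints a single IR instruction in a human-readable form.
 *
 * A two-source instruction comes out roughly like this (hypothetical
 * register numbers):
 *
 *    (+f0.0) add.sat vgrf3+1, vgrf1, u2, (null)
 *
 * Unused sources print as "(null)"; MRF sources and uniform destinations
 * are flagged with *** since they would be invalid.
 */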
2233 void
2234 fs_visitor::dump_instruction(fs_inst *inst)
2235 {
2236 if (inst->predicate) {
2237 printf("(%cf0.%d) ",
2238 inst->predicate_inverse ? '-' : '+',
2239 inst->flag_subreg);
2240 }
2241
2242 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2243 opcode_descs[inst->opcode].name) {
2244 printf("%s", opcode_descs[inst->opcode].name);
2245 } else {
2246 printf("op%d", inst->opcode);
2247 }
2248 if (inst->saturate)
2249 printf(".sat");
2250 if (inst->conditional_mod) {
2251 printf(".cmod");
2252 if (!inst->predicate &&
2253 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2254 inst->opcode != BRW_OPCODE_IF &&
2255 inst->opcode != BRW_OPCODE_WHILE))) {
2256          printf(".f0.%d", inst->flag_subreg);
2257 }
2258 }
2259 printf(" ");
2260
2261
2262 switch (inst->dst.file) {
2263 case GRF:
2264 printf("vgrf%d", inst->dst.reg);
2265 if (inst->dst.reg_offset)
2266 printf("+%d", inst->dst.reg_offset);
2267 break;
2268 case MRF:
2269 printf("m%d", inst->dst.reg);
2270 break;
2271 case BAD_FILE:
2272 printf("(null)");
2273 break;
2274 case UNIFORM:
2275 printf("***u%d***", inst->dst.reg);
2276 break;
2277 default:
2278 printf("???");
2279 break;
2280 }
2281 printf(", ");
2282
2283 for (int i = 0; i < 3; i++) {
2284 if (inst->src[i].negate)
2285 printf("-");
2286 if (inst->src[i].abs)
2287 printf("|");
2288 switch (inst->src[i].file) {
2289 case GRF:
2290 printf("vgrf%d", inst->src[i].reg);
2291 if (inst->src[i].reg_offset)
2292 printf("+%d", inst->src[i].reg_offset);
2293 break;
2294 case MRF:
2295 printf("***m%d***", inst->src[i].reg);
2296 break;
2297 case UNIFORM:
2298 printf("u%d", inst->src[i].reg);
2299 if (inst->src[i].reg_offset)
2300 printf(".%d", inst->src[i].reg_offset);
2301 break;
2302 case BAD_FILE:
2303 printf("(null)");
2304 break;
2305 default:
2306 printf("???");
2307 break;
2308 }
2309 if (inst->src[i].abs)
2310 printf("|");
2311
2312       if (i < 2)
2313 printf(", ");
2314 }
2315
2316 printf(" ");
2317
2318 if (inst->force_uncompressed)
2319 printf("1sthalf ");
2320
2321 if (inst->force_sechalf)
2322 printf("2ndhalf ");
2323
2324 printf("\n");
2325 }
2326
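/**
 * Prints the entire instruction list, prefixing each instruction with its
 * instruction-pointer (ip) index.
 */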
2327 void
2328 fs_visitor::dump_instructions()
2329 {
2330 int ip = 0;
2331 foreach_list(node, &this->instructions) {
2332 fs_inst *inst = (fs_inst *)node;
2333 printf("%d: ", ip++);
2334 dump_instruction(inst);
2335 }
2336 }
2337
2338 /**
2339 * Possibly returns an instruction that set up @param reg.
2340 *
2341 * Sometimes we want to take the result of some expression/variable
2342 * dereference tree and rewrite the instruction generating the result
2343 * of the tree. When processing the tree, we know that the
2344 * instructions generated are all writing temporaries that are dead
2345 * outside of this tree. So, if we have some instructions that write
2346 * a temporary, we're free to point that temp write somewhere else.
2347 *
2348 * Note that this doesn't guarantee that the instruction generated
2349 * only reg -- it might be the size=4 destination of a texture instruction.
2350 */
2351 fs_inst *
2352 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2353 fs_inst *end,
2354 fs_reg reg)
2355 {
2356 if (end == start ||
2357 end->predicate ||
2358 end->force_uncompressed ||
2359 end->force_sechalf ||
2360 reg.reladdr ||
2361 !reg.equals(end->dst)) {
2362 return NULL;
2363 } else {
2364 return end;
2365 }
2366 }
2367
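/**
 * Computes the payload register layout for gen6+ fragment shader threads:
 * the fixed mask/coordinate registers, the enabled barycentric coordinate
 * sets, and the interpolated source depth/W registers.
 */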
2368 void
2369 fs_visitor::setup_payload_gen6()
2370 {
2371 struct intel_context *intel = &brw->intel;
2372 bool uses_depth =
2373 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2374 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2375
2376 assert(intel->gen >= 6);
2377
2378 /* R0-1: masks, pixel X/Y coordinates. */
2379 c->nr_payload_regs = 2;
2380    /* R2: only for 32-pixel dispatch. */
2381
2382 /* R3-26: barycentric interpolation coordinates. These appear in the
2383 * same order that they appear in the brw_wm_barycentric_interp_mode
2384 * enum. Each set of coordinates occupies 2 registers if dispatch width
2385 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2386 * appear if they were enabled using the "Barycentric Interpolation
2387 * Mode" bits in WM_STATE.
2388 */
2389 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2390 if (barycentric_interp_modes & (1 << i)) {
2391 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2392 c->nr_payload_regs += 2;
2393 if (dispatch_width == 16) {
2394 c->nr_payload_regs += 2;
2395 }
2396 }
2397 }
2398
2399 /* R27: interpolated depth if uses source depth */
2400 if (uses_depth) {
2401 c->source_depth_reg = c->nr_payload_regs;
2402 c->nr_payload_regs++;
2403 if (dispatch_width == 16) {
2404 /* R28: interpolated depth if not 8-wide. */
2405 c->nr_payload_regs++;
2406 }
2407 }
2408 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2409 if (uses_depth) {
2410 c->source_w_reg = c->nr_payload_regs;
2411 c->nr_payload_regs++;
2412 if (dispatch_width == 16) {
2413 /* R30: interpolated W if not 8-wide. */
2414 c->nr_payload_regs++;
2415 }
2416 }
2417 /* R31: MSAA position offsets. */
2418 /* R32-: bary for 32-pixel. */
2419 /* R58-59: interp W for 32-pixel. */
2420
2421 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2422 c->source_depth_to_render_target = true;
2423 }
2424 }
2425
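/**
 * Translates the program to FS IR, runs the optimization loop, and
 * performs register allocation.  Returns false if compilation failed,
 * for example because register allocation could not fit the program
 * even after spilling.
 */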
2426 bool
2427 fs_visitor::run()
2428 {
2429 uint32_t orig_nr_params = c->prog_data.nr_params;
2430
2431 if (intel->gen >= 6)
2432 setup_payload_gen6();
2433 else
2434 setup_payload_gen4();
2435
2436 if (0) {
2437 emit_dummy_fs();
2438 } else {
2439 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2440 emit_shader_time_begin();
2441
2442 calculate_urb_setup();
2443 if (intel->gen < 6)
2444 emit_interpolation_setup_gen4();
2445 else
2446 emit_interpolation_setup_gen6();
2447
2448 /* We handle discards by keeping track of the still-live pixels in f0.1.
2449 * Initialize it with the dispatched pixels.
2450 */
2451 if (fp->UsesKill) {
2452 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2453 discard_init->flag_subreg = 1;
2454 }
2455
2456 /* Generate FS IR for main(). (the visitor only descends into
2457 * functions called "main").
2458 */
2459 if (shader) {
2460 foreach_list(node, &*shader->ir) {
2461 ir_instruction *ir = (ir_instruction *)node;
2462 base_ir = ir;
2463 this->result = reg_undef;
2464 ir->accept(this);
2465 }
2466 } else {
2467 emit_fragment_program_code();
2468 }
2469 base_ir = NULL;
2470 if (failed)
2471 return false;
2472
2473 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2474 emit_shader_time_end();
2475
2476 emit_fb_writes();
2477
2478 split_virtual_grfs();
2479
2480 setup_paramvalues_refs();
2481 move_uniform_array_access_to_pull_constants();
2482 setup_pull_constants();
2483
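      /* Run the optimization passes to a fixed point: each pass can expose
       * new opportunities for the others, so keep looping until none of
       * them makes progress.
       */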
2484 bool progress;
2485 do {
2486 progress = false;
2487
2488 compact_virtual_grfs();
2489
2490 progress = remove_duplicate_mrf_writes() || progress;
2491
2492 progress = opt_algebraic() || progress;
2493 progress = opt_cse() || progress;
2494 progress = opt_copy_propagate() || progress;
2495 progress = dead_code_eliminate() || progress;
2496 progress = register_coalesce() || progress;
2497 progress = register_coalesce_2() || progress;
2498 progress = compute_to_mrf() || progress;
2499 } while (progress);
2500
2501 remove_dead_constants();
2502
2503 schedule_instructions(false);
2504
2505 assign_curb_setup();
2506 assign_urb_setup();
2507
2508 if (0) {
2509 /* Debug of register spilling: Go spill everything. */
2510 for (int i = 0; i < virtual_grf_count; i++) {
2511 spill_reg(i);
2512 }
2513 }
2514
2515 if (0)
2516 assign_regs_trivial();
2517 else {
2518 while (!assign_regs()) {
2519 if (failed)
2520 break;
2521 }
2522 }
2523 }
2524 assert(force_uncompressed_stack == 0);
2525 assert(force_sechalf_stack == 0);
2526
2527 if (failed)
2528 return false;
2529
2530 schedule_instructions(true);
2531
2532 if (dispatch_width == 8) {
2533 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2534 } else {
2535 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2536
2537 /* Make sure we didn't try to sneak in an extra uniform */
2538 assert(orig_nr_params == c->prog_data.nr_params);
2539 (void) orig_nr_params;
2540 }
2541
2542 return !failed;
2543 }
2544
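/**
 * Compiles the fragment program to native code: always an 8-wide program,
 * plus a 16-wide variant when the hardware and the shader allow it.
 */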
2545 const unsigned *
2546 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2547 struct gl_fragment_program *fp,
2548 struct gl_shader_program *prog,
2549 unsigned *final_assembly_size)
2550 {
2551 struct intel_context *intel = &brw->intel;
2552 bool start_busy = false;
2553 float start_time = 0;
2554
2555 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2556 start_busy = (intel->batch.last_bo &&
2557 drm_intel_bo_busy(intel->batch.last_bo));
2558 start_time = get_time();
2559 }
2560
2561 struct brw_shader *shader = NULL;
2562 if (prog)
2563 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2564
2565 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2566 if (shader) {
2567 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2568 _mesa_print_ir(shader->ir, NULL);
2569 printf("\n\n");
2570 } else {
2571 printf("ARB_fragment_program %d ir for native fragment shader\n",
2572 fp->Base.Id);
2573 _mesa_print_program(&fp->Base);
2574 }
2575 }
2576
2577 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2578 */
2579 fs_visitor v(brw, c, prog, fp, 8);
2580 if (!v.run()) {
2581 prog->LinkStatus = false;
2582 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2583
2584 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2585 v.fail_msg);
2586
2587 return NULL;
2588 }
2589
2590 exec_list *simd16_instructions = NULL;
2591 fs_visitor v2(brw, c, prog, fp, 16);
2592 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2593 v2.import_uniforms(&v);
2594 if (!v2.run()) {
2595 perf_debug("16-wide shader failed to compile, falling back to "
2596 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2597 } else {
2598 simd16_instructions = &v2.instructions;
2599 }
2600 }
2601
2602 c->prog_data.dispatch_width = 8;
2603
2604 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2605 const unsigned *generated = g.generate_assembly(&v.instructions,
2606 simd16_instructions,
2607 final_assembly_size);
2608
2609 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2610 if (shader->compiled_once)
2611 brw_wm_debug_recompile(brw, prog, &c->key);
2612 shader->compiled_once = true;
2613
2614 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2615 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2616 (get_time() - start_time) * 1000);
2617 }
2618 }
2619
2620 return generated;
2621 }
2622
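/**
 * Precompiles the fragment shader at link time using a guessed program
 * key, so that the likely first-draw state doesn't trigger a recompile.
 * The previously-bound program data is restored afterwards.
 */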
2623 bool
2624 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2625 {
2626 struct brw_context *brw = brw_context(ctx);
2627 struct intel_context *intel = &brw->intel;
2628 struct brw_wm_prog_key key;
2629
2630 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2631 return true;
2632
2633 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2634 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2635 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2636 bool program_uses_dfdy = fp->UsesDFdy;
2637
2638 memset(&key, 0, sizeof(key));
2639
2640 if (intel->gen < 6) {
2641 if (fp->UsesKill)
2642 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2643
2644 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2645 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2646
2647 /* Just assume depth testing. */
2648 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2649 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2650 }
2651
2652 if (prog->Name != 0)
2653 key.proj_attrib_mask = 0xffffffff;
2654
2655 if (intel->gen < 6)
2656 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2657
2658 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2659 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2660 continue;
2661
2662 if (prog->Name == 0)
2663 key.proj_attrib_mask |= 1 << i;
2664
2665 if (intel->gen < 6) {
2666 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2667
2668 if (vp_index >= 0)
2669 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2670 }
2671 }
2672
2673 key.clamp_fragment_color = true;
2674
2675 for (int i = 0; i < MAX_SAMPLERS; i++) {
2676 if (fp->Base.ShadowSamplers & (1 << i)) {
2677 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2678 key.tex.swizzles[i] =
2679 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2680 } else {
2681 /* Color sampler: assume no swizzling. */
2682 key.tex.swizzles[i] = SWIZZLE_XYZW;
2683 }
2684 }
2685
2686 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2687 key.drawable_height = ctx->DrawBuffer->Height;
2688 }
2689
2690 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2691 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2692 }
2693
2694 key.nr_color_regions = 1;
2695
2696 key.program_string_id = bfp->id;
2697
2698 uint32_t old_prog_offset = brw->wm.prog_offset;
2699 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2700
2701 bool success = do_wm_prog(brw, prog, bfp, &key);
2702
2703 brw->wm.prog_offset = old_prog_offset;
2704 brw->wm.prog_data = old_prog_data;
2705
2706 return success;
2707 }