i965/fs: Reference the core GL uniform storage for non-builtin uniforms.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                      \
   fs_inst *                                                          \
   fs_visitor::op(fs_reg dst, fs_reg src0)                            \
   {                                                                  \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);        \
   }

#define ALU2(op)                                                      \
   fs_inst *                                                          \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)               \
   {                                                                  \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);  \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
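   /* Illustrative arithmetic (an addition, not from the original comment):
    * at a nominal 1.2 GHz clock, a 32-bit cycle counter wraps after
    *
    *    2^32 cycles / 1.2e9 cycles/sec ~= 3.58 seconds,
    *
    * which is where the "roll over every ~3 seconds" figure above comes
    * from.
    */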
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index <= brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
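   /* Illustrative example (an addition, not from the original comment):
    * given
    *
    *    uniform struct { vec4 color; float scale; } s;
    *
    * the storage list holds separate entries named "s.color" and "s.scale".
    * Both match a lookup for ir->name == "s": the prefix compares equal and
    * the character just past the match is '.'.  The '[' case covers entries
    * like "a[0].color" for arrays of structs.
    */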
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
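/* Illustrative example (an addition, not from the original comment): a
 * vec4 temporary occupies one size-4 virtual GRF.  After splitting, an
 * access at reg_offset 2 is rewritten to its own size-1 virtual GRF, so
 * the register allocator no longer needs four contiguous hardware
 * registers for the whole vector.
 */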
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
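/* Illustrative example (an addition, not from the original comment): if
 * only virtual GRFs 0, 3, and 7 are referenced, remap_table becomes
 * {0, -1, -1, 1, -1, -1, -1, 2} and later passes iterate over 3
 * registers instead of 8.
 */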
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
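/* Illustrative example (an addition, not from the original comment):
 * GLSL like
 *
 *    uniform vec4 colors[32];
 *    ... = colors[idx];
 *
 * reaches this pass as a UNIFORM source whose reladdr register holds idx.
 * The pass copies the array into the pull constant buffer, emits an ADD
 * to form the element offset, and replaces the source with the result of
 * a VARYING_PULL_CONSTANT_LOAD into a temporary GRF.
 */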
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
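/* Illustrative arithmetic (an addition, not from the original comment):
 * the push constant budget below is 16 registers * 8 floats = 128
 * uniform components, so a shader with 200 float params keeps the first
 * 128 as push constants and demotes the remaining 72 to the pull
 * constant buffer.
 */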
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something def'd but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
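/* Illustrative example (an addition, not from the original comment):
 * for
 *
 *    MOV tmp2, tmp1
 *
 * where tmp1 and tmp2 are size-1 virtual GRFs whose live ranges don't
 * interfere, every def and use of tmp1 is renamed to tmp2 and the MOV
 * itself is deleted.
 */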
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *   ^
       *   |
       *   v
       *         ^
       *         |
       *         v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
1838 */
1839 live_intervals_valid = false;
1840
1841 progress = true;
1842 continue;
1843 }
1844
1845 return progress;
1846 }
1847
1848 bool
1849 fs_visitor::register_coalesce()
1850 {
1851 bool progress = false;
1852 int if_depth = 0;
1853 int loop_depth = 0;
1854
1855 foreach_list_safe(node, &this->instructions) {
1856 fs_inst *inst = (fs_inst *)node;
1857
1858 /* Make sure that we dominate the instructions we're going to
1859 * scan for interfering with our coalescing, or we won't have
1860 * scanned enough to see if anything interferes with our
1861 * coalescing. We don't dominate the following instructions if
1862 * we're in a loop or an if block.
1863 */
1864 switch (inst->opcode) {
1865 case BRW_OPCODE_DO:
1866 loop_depth++;
1867 break;
1868 case BRW_OPCODE_WHILE:
1869 loop_depth--;
1870 break;
1871 case BRW_OPCODE_IF:
1872 if_depth++;
1873 break;
1874 case BRW_OPCODE_ENDIF:
1875 if_depth--;
1876 break;
1877 default:
1878 break;
1879 }
1880 if (loop_depth || if_depth)
1881 continue;
1882
1883 if (inst->opcode != BRW_OPCODE_MOV ||
1884 inst->predicate ||
1885 inst->saturate ||
1886 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1887 inst->src[0].file != UNIFORM)||
1888 inst->dst.type != inst->src[0].type)
1889 continue;
1890
1891 bool has_source_modifiers = (inst->src[0].abs ||
1892 inst->src[0].negate ||
1893 inst->src[0].file == UNIFORM);
1894
1895 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1896 * them: check for no writes to either one until the exit of the
1897 * program.
1898 */
1899 bool interfered = false;
1900
1901 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1902 !scan_inst->is_tail_sentinel();
1903 scan_inst = (fs_inst *)scan_inst->next) {
1904 if (scan_inst->dst.file == GRF) {
1905 if (scan_inst->overwrites_reg(inst->dst) ||
1906 scan_inst->overwrites_reg(inst->src[0])) {
1907 interfered = true;
1908 break;
1909 }
1910 }
1911
1912 /* The gen6 MATH instruction can't handle source modifiers or
1913 * unusual register regions, so avoid coalescing those for
1914 * now. We should do something more specific.
1915 */
1916 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1917 interfered = true;
1918 break;
1919 }
1920
1921 /* The accumulator result appears to get used for the
1922 * conditional modifier generation. When negating a UD
1923 * value, there is a 33rd bit generated for the sign in the
1924 * accumulator value, so now you can't check, for example,
1925 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1926 */
1927 if (scan_inst->conditional_mod &&
1928 inst->src[0].negate &&
1929 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1930 interfered = true;
1931 break;
1932 }
1933 }
1934 if (interfered) {
1935 continue;
1936 }
1937
1938 /* Rewrite the later usage to point at the source of the move to
1939 * be removed.
1940 */
1941 for (fs_inst *scan_inst = inst;
1942 !scan_inst->is_tail_sentinel();
1943 scan_inst = (fs_inst *)scan_inst->next) {
1944 for (int i = 0; i < 3; i++) {
1945 if (scan_inst->src[i].file == GRF &&
1946 scan_inst->src[i].reg == inst->dst.reg &&
1947 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1948 fs_reg new_src = inst->src[0];
1949 if (scan_inst->src[i].abs) {
1950 new_src.negate = 0;
1951 new_src.abs = 1;
1952 }
1953 new_src.negate ^= scan_inst->src[i].negate;
1954 scan_inst->src[i] = new_src;
1955 }
1956 }
1957 }
1958
1959 inst->remove();
1960 progress = true;
1961 }
1962
1963 if (progress)
1964 live_intervals_valid = false;
1965
1966 return progress;
1967 }
1968
1969
1970 bool
1971 fs_visitor::compute_to_mrf()
1972 {
1973 bool progress = false;
1974 int next_ip = 0;
1975
1976 calculate_live_intervals();
1977
1978 foreach_list_safe(node, &this->instructions) {
1979 fs_inst *inst = (fs_inst *)node;
1980
1981 int ip = next_ip;
1982 next_ip++;
1983
1984 if (inst->opcode != BRW_OPCODE_MOV ||
1985 inst->predicate ||
1986 inst->dst.file != MRF || inst->src[0].file != GRF ||
1987 inst->dst.type != inst->src[0].type ||
1988 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1989 continue;
1990
1991 /* Work out which hardware MRF registers are written by this
1992 * instruction.
1993 */
1994 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1995 int mrf_high;
1996 if (inst->dst.reg & BRW_MRF_COMPR4) {
1997 mrf_high = mrf_low + 4;
1998 } else if (dispatch_width == 16 &&
1999 (!inst->force_uncompressed && !inst->force_sechalf)) {
2000 mrf_high = mrf_low + 1;
2001 } else {
2002 mrf_high = mrf_low;
2003 }
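      /* E.g. (a sketch, assuming COMPR4 routes the second half to
       * reg + 4): a 16-wide write to m2 covers m2..m3 (mrf_low = 2,
       * mrf_high = 3), while a write to m2 | BRW_MRF_COMPR4 lands in m2
       * and m6 (mrf_low = 2, mrf_high = 6).
       */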
2004
2005 /* Can't compute-to-MRF this GRF if someone else was going to
2006 * read it later.
2007 */
2008 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2009 continue;
2010
2011       /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
2012        * the instruction that generated this GRF to write into the MRF directly.
2013 */
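      /* A hypothetical before/after sketch:
       *
       *    ADD vgrf3, vgrf1, vgrf2
       *    MOV m4, vgrf3             <- removed
       * becomes
       *    ADD m4, vgrf1, vgrf2
       */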
2014 fs_inst *scan_inst;
2015 for (scan_inst = (fs_inst *)inst->prev;
2016 scan_inst->prev != NULL;
2017 scan_inst = (fs_inst *)scan_inst->prev) {
2018 if (scan_inst->dst.file == GRF &&
2019 scan_inst->dst.reg == inst->src[0].reg) {
2020 	 /* Found the last instruction to write the register we want
2021 	  * to turn into a compute-to-MRF.
2022 */
2023
2024 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2025 if (scan_inst->mlen) {
2026 break;
2027 }
2028
2029 /* If it's predicated, it (probably) didn't populate all
2030 * the channels. We might be able to rewrite everything
2031 * that writes that reg, but it would require smarter
2032 * tracking to delay the rewriting until complete success.
2033 */
2034 if (scan_inst->predicate)
2035 break;
2036
2037 	 /* If it only writes half of the register, and it's not the
2038 	  * same half as the MOV we're trying to remove, bail for now.
2039 */
2040 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2041 scan_inst->force_sechalf != inst->force_sechalf) {
2042 break;
2043 }
2044
2049 if (intel->gen >= 6) {
2050 /* gen6 math instructions must have the destination be
2051 * GRF, so no compute-to-MRF for them.
2052 */
2053 if (scan_inst->is_math()) {
2054 break;
2055 }
2056 }
2057
2058 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2059 /* Found the creator of our MRF's source value. */
2060 scan_inst->dst.file = MRF;
2061 scan_inst->dst.reg = inst->dst.reg;
2062 scan_inst->saturate |= inst->saturate;
2063 inst->remove();
2064 progress = true;
2065 }
2066 break;
2067 }
2068
2069       /* We don't handle flow control here.  Most values that end up
2070        * in MRFs are computed shortly before the MRF write anyway.
2071        */
2073 if (scan_inst->opcode == BRW_OPCODE_DO ||
2074 scan_inst->opcode == BRW_OPCODE_WHILE ||
2075 scan_inst->opcode == BRW_OPCODE_ELSE ||
2076 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2077 break;
2078 }
2079
2080 /* You can't read from an MRF, so if someone else reads our
2081 * MRF's source GRF that we wanted to rewrite, that stops us.
2082 */
2083 bool interfered = false;
2084 for (int i = 0; i < 3; i++) {
2085 if (scan_inst->src[i].file == GRF &&
2086 scan_inst->src[i].reg == inst->src[0].reg &&
2087 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2088 interfered = true;
2089 }
2090 }
2091 if (interfered)
2092 break;
2093
2094 if (scan_inst->dst.file == MRF) {
2095 /* If somebody else writes our MRF here, we can't
2096 * compute-to-MRF before that.
2097 */
2098 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2099 int scan_mrf_high;
2100
2101 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2102 scan_mrf_high = scan_mrf_low + 4;
2103 } else if (dispatch_width == 16 &&
2104 (!scan_inst->force_uncompressed &&
2105 !scan_inst->force_sechalf)) {
2106 scan_mrf_high = scan_mrf_low + 1;
2107 } else {
2108 scan_mrf_high = scan_mrf_low;
2109 }
2110
2111 if (mrf_low == scan_mrf_low ||
2112 mrf_low == scan_mrf_high ||
2113 mrf_high == scan_mrf_low ||
2114 mrf_high == scan_mrf_high) {
2115 break;
2116 }
2117 }
2118
2119 if (scan_inst->mlen > 0) {
2120 /* Found a SEND instruction, which means that there are
2121 * live values in MRFs from base_mrf to base_mrf +
2122 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2123 * above it.
2124 */
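	 /* E.g. (illustrative numbers) a SEND with base_mrf == 2 and
	  * mlen == 3 reads m2..m4; hoisting a write of m3 above it would
	  * clobber the message payload.
	  */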
2125 if (mrf_low >= scan_inst->base_mrf &&
2126 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2127 break;
2128 }
2129 if (mrf_high >= scan_inst->base_mrf &&
2130 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2131 break;
2132 }
2133 }
2134 }
2135 }
2136
2137 if (progress)
2138 live_intervals_valid = false;
2139
2140 return progress;
2141 }
2142
2143 /**
2144 * Walks through basic blocks, looking for repeated MRF writes and
2145 * removing the later ones.
2146 */
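/* A hypothetical sketch of what gets removed:
 *
 *    MOV m4, vgrf2
 *    ADD vgrf5, vgrf6, vgrf7
 *    MOV m4, vgrf2             <- redundant, removed
 *
 * The second MOV goes away because neither m4 nor vgrf2 was written in
 * between.
 */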
2147 bool
2148 fs_visitor::remove_duplicate_mrf_writes()
2149 {
2150 fs_inst *last_mrf_move[16];
2151 bool progress = false;
2152
2153    /* The MRF tracking below doesn't yet handle compressed instructions. */
2154 if (dispatch_width == 16)
2155 return false;
2156
2157 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2158
2159 foreach_list_safe(node, &this->instructions) {
2160 fs_inst *inst = (fs_inst *)node;
2161
2162 switch (inst->opcode) {
2163 case BRW_OPCODE_DO:
2164 case BRW_OPCODE_WHILE:
2165 case BRW_OPCODE_IF:
2166 case BRW_OPCODE_ELSE:
2167 case BRW_OPCODE_ENDIF:
2168 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2169 continue;
2170 default:
2171 break;
2172 }
2173
2174 if (inst->opcode == BRW_OPCODE_MOV &&
2175 inst->dst.file == MRF) {
2176 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2177 if (prev_inst && inst->equals(prev_inst)) {
2178 inst->remove();
2179 progress = true;
2180 continue;
2181 }
2182 }
2183
2184 /* Clear out the last-write records for MRFs that were overwritten. */
2185 if (inst->dst.file == MRF) {
2186 last_mrf_move[inst->dst.reg] = NULL;
2187 }
2188
2189 if (inst->mlen > 0) {
2190 /* Found a SEND instruction, which will include two or fewer
2191 * implied MRF writes. We could do better here.
2192 */
2193 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2194 last_mrf_move[inst->base_mrf + i] = NULL;
2195 }
2196 }
2197
2198 /* Clear out any MRF move records whose sources got overwritten. */
2199 if (inst->dst.file == GRF) {
2200 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2201 if (last_mrf_move[i] &&
2202 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2203 last_mrf_move[i] = NULL;
2204 }
2205 }
2206 }
2207
2208 if (inst->opcode == BRW_OPCODE_MOV &&
2209 inst->dst.file == MRF &&
2210 inst->src[0].file == GRF &&
2211 !inst->predicate) {
2212 last_mrf_move[inst->dst.reg] = inst;
2213 }
2214 }
2215
2216 if (progress)
2217 live_intervals_valid = false;
2218
2219 return progress;
2220 }
2221
2222 void
2223 fs_visitor::dump_instruction(fs_inst *inst)
2224 {
2225 if (inst->predicate) {
2226 printf("(%cf0.%d) ",
2227 inst->predicate_inverse ? '-' : '+',
2228 inst->flag_subreg);
2229 }
2230
2231 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2232 opcode_descs[inst->opcode].name) {
2233 printf("%s", opcode_descs[inst->opcode].name);
2234 } else {
2235 printf("op%d", inst->opcode);
2236 }
2237 if (inst->saturate)
2238 printf(".sat");
2239 if (inst->conditional_mod) {
2240 printf(".cmod");
2241 if (!inst->predicate &&
2242 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2243 inst->opcode != BRW_OPCODE_IF &&
2244 inst->opcode != BRW_OPCODE_WHILE))) {
2245 printf(".f0.%d\n", inst->flag_subreg);
2246 }
2247 }
2248 printf(" ");
2249
2251 switch (inst->dst.file) {
2252 case GRF:
2253 printf("vgrf%d", inst->dst.reg);
2254 if (inst->dst.reg_offset)
2255 printf("+%d", inst->dst.reg_offset);
2256 break;
2257 case MRF:
2258 printf("m%d", inst->dst.reg);
2259 break;
2260 case BAD_FILE:
2261 printf("(null)");
2262 break;
2263 case UNIFORM:
2264 printf("***u%d***", inst->dst.reg);
2265 break;
2266 default:
2267 printf("???");
2268 break;
2269 }
2270 printf(", ");
2271
2272 for (int i = 0; i < 3; i++) {
2273 if (inst->src[i].negate)
2274 printf("-");
2275 if (inst->src[i].abs)
2276 printf("|");
2277 switch (inst->src[i].file) {
2278 case GRF:
2279 printf("vgrf%d", inst->src[i].reg);
2280 if (inst->src[i].reg_offset)
2281 printf("+%d", inst->src[i].reg_offset);
2282 break;
2283 case MRF:
2284 printf("***m%d***", inst->src[i].reg);
2285 break;
2286 case UNIFORM:
2287 printf("u%d", inst->src[i].reg);
2288 if (inst->src[i].reg_offset)
2289 printf(".%d", inst->src[i].reg_offset);
2290 break;
2291 case BAD_FILE:
2292 printf("(null)");
2293 break;
2294 default:
2295 printf("???");
2296 break;
2297 }
2298 if (inst->src[i].abs)
2299 printf("|");
2300
2301       if (i < 2)
2302 printf(", ");
2303 }
2304
2305 printf(" ");
2306
2307 if (inst->force_uncompressed)
2308 printf("1sthalf ");
2309
2310 if (inst->force_sechalf)
2311 printf("2ndhalf ");
2312
2313 printf("\n");
2314 }
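
/* With the above, a dump line looks roughly like (values illustrative):
 *
 *    (+f0.0) add.sat vgrf7+1, vgrf3, -|vgrf4|, (null) 1sthalf
 */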
2315
2316 void
2317 fs_visitor::dump_instructions()
2318 {
2319 int ip = 0;
2320 foreach_list(node, &this->instructions) {
2321 fs_inst *inst = (fs_inst *)node;
2322 printf("%d: ", ip++);
2323 dump_instruction(inst);
2324 }
2325 }
2326
2327 /**
2328 * Possibly returns an instruction that set up @param reg.
2329 *
2330 * Sometimes we want to take the result of some expression/variable
2331 * dereference tree and rewrite the instruction generating the result
2332 * of the tree. When processing the tree, we know that the
2333 * instructions generated are all writing temporaries that are dead
2334 * outside of this tree. So, if we have some instructions that write
2335 * a temporary, we're free to point that temp write somewhere else.
2336 *
2337  * Note that this doesn't guarantee that the returned instruction wrote
2338  * only reg -- it might be the size=4 destination of a texture instruction.
2339 */
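/* E.g. (a sketch): if "end" is a texture instruction whose result spans
 * four registers and reg names only the first of them, redirecting
 * end->dst also moves the other three; callers must account for that.
 */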
2340 fs_inst *
2341 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2342 fs_inst *end,
2343 fs_reg reg)
2344 {
2345 if (end == start ||
2346 end->predicate ||
2347 end->force_uncompressed ||
2348 end->force_sechalf ||
2349 reg.reladdr ||
2350 !reg.equals(end->dst)) {
2351 return NULL;
2352 } else {
2353 return end;
2354 }
2355 }
2356
2357 void
2358 fs_visitor::setup_payload_gen6()
2359 {
2360 struct intel_context *intel = &brw->intel;
2361 bool uses_depth =
2362 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2363 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2364
2365 assert(intel->gen >= 6);
2366
2367 /* R0-1: masks, pixel X/Y coordinates. */
2368 c->nr_payload_regs = 2;
2369    /* R2: only for 32-pixel dispatch. */
2370
2371 /* R3-26: barycentric interpolation coordinates. These appear in the
2372 * same order that they appear in the brw_wm_barycentric_interp_mode
2373 * enum. Each set of coordinates occupies 2 registers if dispatch width
2374 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2375 * appear if they were enabled using the "Barycentric Interpolation
2376 * Mode" bits in WM_STATE.
2377 */
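   /* For example (a sketch): in SIMD8 with two interpolation modes
    * enabled, each mode takes 2 payload registers, so nr_payload_regs
    * advances from 2 to 6; in SIMD16 each mode takes 4, advancing it
    * to 10.
    */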
2378 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2379 if (barycentric_interp_modes & (1 << i)) {
2380 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2381 c->nr_payload_regs += 2;
2382 if (dispatch_width == 16) {
2383 c->nr_payload_regs += 2;
2384 }
2385 }
2386 }
2387
2388    /* R27: interpolated depth if the shader uses source depth. */
2389 if (uses_depth) {
2390 c->source_depth_reg = c->nr_payload_regs;
2391 c->nr_payload_regs++;
2392 if (dispatch_width == 16) {
2393 /* R28: interpolated depth if not 8-wide. */
2394 c->nr_payload_regs++;
2395 }
2396 }
2397 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2398 if (uses_depth) {
2399 c->source_w_reg = c->nr_payload_regs;
2400 c->nr_payload_regs++;
2401 if (dispatch_width == 16) {
2402 /* R30: interpolated W if not 8-wide. */
2403 c->nr_payload_regs++;
2404 }
2405 }
2406 /* R31: MSAA position offsets. */
2407 /* R32-: bary for 32-pixel. */
2408 /* R58-59: interp W for 32-pixel. */
2409
2410 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2411 c->source_depth_to_render_target = true;
2412 }
2413 }
2414
2415 bool
2416 fs_visitor::run()
2417 {
2418 sanity_param_count = fp->Base.Parameters->NumParameters;
2419 uint32_t orig_nr_params = c->prog_data.nr_params;
2420
2421 if (intel->gen >= 6)
2422 setup_payload_gen6();
2423 else
2424 setup_payload_gen4();
2425
2426 if (0) {
2427 emit_dummy_fs();
2428 } else {
2429 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2430 emit_shader_time_begin();
2431
2432 calculate_urb_setup();
2433 if (intel->gen < 6)
2434 emit_interpolation_setup_gen4();
2435 else
2436 emit_interpolation_setup_gen6();
2437
2438 /* We handle discards by keeping track of the still-live pixels in f0.1.
2439 * Initialize it with the dispatched pixels.
2440 */
2441 if (fp->UsesKill) {
2442 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2443 discard_init->flag_subreg = 1;
2444 }
2445
2446       /* Generate FS IR for main().  (The visitor only descends into
2447        * functions called "main".)
2448 */
2449 if (shader) {
2450 foreach_list(node, &*shader->ir) {
2451 ir_instruction *ir = (ir_instruction *)node;
2452 base_ir = ir;
2453 this->result = reg_undef;
2454 ir->accept(this);
2455 }
2456 } else {
2457 emit_fragment_program_code();
2458 }
2459 base_ir = NULL;
2460 if (failed)
2461 return false;
2462
2463 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2464 emit_shader_time_end();
2465
2466 emit_fb_writes();
2467
2468 split_virtual_grfs();
2469
2470 move_uniform_array_access_to_pull_constants();
2471 setup_pull_constants();
2472
2473 bool progress;
2474 do {
2475 progress = false;
2476
2477 compact_virtual_grfs();
2478
2479 progress = remove_duplicate_mrf_writes() || progress;
2480
2481 progress = opt_algebraic() || progress;
2482 progress = opt_cse() || progress;
2483 progress = opt_copy_propagate() || progress;
2484 progress = dead_code_eliminate() || progress;
2485 progress = register_coalesce() || progress;
2486 progress = register_coalesce_2() || progress;
2487 progress = compute_to_mrf() || progress;
2488 } while (progress);
2489
2490 remove_dead_constants();
2491
2492 schedule_instructions(false);
2493
2494 assign_curb_setup();
2495 assign_urb_setup();
2496
2497 if (0) {
2498 /* Debug of register spilling: Go spill everything. */
2499 for (int i = 0; i < virtual_grf_count; i++) {
2500 spill_reg(i);
2501 }
2502 }
2503
2504 if (0)
2505 assign_regs_trivial();
2506 else {
2507 while (!assign_regs()) {
2508 if (failed)
2509 break;
2510 }
2511 }
2512 }
2513 assert(force_uncompressed_stack == 0);
2514 assert(force_sechalf_stack == 0);
2515
2516 if (failed)
2517 return false;
2518
2519 schedule_instructions(true);
2520
2521 if (dispatch_width == 8) {
2522 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2523 } else {
2524 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2525
2526 /* Make sure we didn't try to sneak in an extra uniform */
2527 assert(orig_nr_params == c->prog_data.nr_params);
2528 (void) orig_nr_params;
2529 }
2530
2531 /* If any state parameters were appended, then ParameterValues could have
2532 * been realloced, in which case the driver uniform storage set up by
2533 * _mesa_associate_uniform_storage() would point to freed memory. Make
2534 * sure that didn't happen.
2535 */
2536 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2537
2538 return !failed;
2539 }
2540
2541 const unsigned *
2542 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2543 struct gl_fragment_program *fp,
2544 struct gl_shader_program *prog,
2545 unsigned *final_assembly_size)
2546 {
2547 struct intel_context *intel = &brw->intel;
2548 bool start_busy = false;
2549 float start_time = 0;
2550
2551 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2552 start_busy = (intel->batch.last_bo &&
2553 drm_intel_bo_busy(intel->batch.last_bo));
2554 start_time = get_time();
2555 }
2556
2557 struct brw_shader *shader = NULL;
2558 if (prog)
2559 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2560
2561 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2562 if (shader) {
2563 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2564 _mesa_print_ir(shader->ir, NULL);
2565 printf("\n\n");
2566 } else {
2567 printf("ARB_fragment_program %d ir for native fragment shader\n",
2568 fp->Base.Id);
2569 _mesa_print_program(&fp->Base);
2570 }
2571 }
2572
2573 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2574 */
2575 fs_visitor v(brw, c, prog, fp, 8);
2576 if (!v.run()) {
2577 prog->LinkStatus = false;
2578 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2579
2580 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2581 v.fail_msg);
2582
2583 return NULL;
2584 }
2585
2586 exec_list *simd16_instructions = NULL;
2587 fs_visitor v2(brw, c, prog, fp, 16);
2588 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2589 v2.import_uniforms(&v);
2590 if (!v2.run()) {
2591 perf_debug("16-wide shader failed to compile, falling back to "
2592 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2593 } else {
2594 simd16_instructions = &v2.instructions;
2595 }
2596 }
2597
2598 c->prog_data.dispatch_width = 8;
2599
2600 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2601 const unsigned *generated = g.generate_assembly(&v.instructions,
2602 simd16_instructions,
2603 final_assembly_size);
2604
2605 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2606 if (shader->compiled_once)
2607 brw_wm_debug_recompile(brw, prog, &c->key);
2608 shader->compiled_once = true;
2609
2610 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2611 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2612 (get_time() - start_time) * 1000);
2613 }
2614 }
2615
2616 return generated;
2617 }
2618
2619 bool
2620 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2621 {
2622 struct brw_context *brw = brw_context(ctx);
2623 struct intel_context *intel = &brw->intel;
2624 struct brw_wm_prog_key key;
2625
2626 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2627 return true;
2628
2629 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2630 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2631 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2632 bool program_uses_dfdy = fp->UsesDFdy;
2633
2634 memset(&key, 0, sizeof(key));
2635
2636 if (intel->gen < 6) {
2637 if (fp->UsesKill)
2638 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2639
2640 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2641 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2642
2643 /* Just assume depth testing. */
2644 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2645 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2646 }
2647
2648 if (prog->Name != 0)
2649 key.proj_attrib_mask = 0xffffffff;
2650
2651 if (intel->gen < 6)
2652 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2653
2654 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2655 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2656 continue;
2657
2658 if (prog->Name == 0)
2659 key.proj_attrib_mask |= 1 << i;
2660
2661 if (intel->gen < 6) {
2662 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2663
2664 if (vp_index >= 0)
2665 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2666 }
2667 }
2668
2669 key.clamp_fragment_color = true;
2670
2671 for (int i = 0; i < MAX_SAMPLERS; i++) {
2672 if (fp->Base.ShadowSamplers & (1 << i)) {
2673 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2674 key.tex.swizzles[i] =
2675 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2676 } else {
2677 /* Color sampler: assume no swizzling. */
2678 key.tex.swizzles[i] = SWIZZLE_XYZW;
2679 }
2680 }
2681
2682 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2683 key.drawable_height = ctx->DrawBuffer->Height;
2684 }
2685
2686 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2687 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2688 }
2689
2690 key.nr_color_regions = 1;
2691
2692 key.program_string_id = bfp->id;
2693
2694 uint32_t old_prog_offset = brw->wm.prog_offset;
2695 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2696
2697 bool success = do_wm_prog(brw, prog, bfp, &key);
2698
2699 brw->wm.prog_offset = old_prog_offset;
2700 brw->wm.prog_data = old_prog_data;
2701
2702 return success;
2703 }