8e57eb0fd098407cede7ea1a4e90ca2466492c22
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
/* Convenience emitter factories.  Each ALU1/ALU2 invocation below expands
 * into an fs_visitor method (e.g. fs_visitor::MOV) that allocates a new
 * one- or two-source fs_inst of the matching BRW opcode out of mem_ctx and
 * returns it without emitting it; callers typically hand the result to
 * emit().
 */
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

/* Single-source ALU operations. */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
/* Two-source ALU operations. */
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
164
165 /** Gen4 predicated IF. */
166 fs_inst *
167 fs_visitor::IF(uint32_t predicate)
168 {
169 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
170 inst->predicate = predicate;
171 return inst;
172 }
173
174 /** Gen6+ IF with embedded comparison. */
175 fs_inst *
176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
177 {
178 assert(intel->gen >= 6);
179 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
180 reg_null_d, src0, src1);
181 inst->conditional_mod = condition;
182 return inst;
183 }
184
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      /* Keep the fixed-HW-reg view of the destination in sync with the
       * type override above.
       */
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   /* Negated UD sources are rewritten first — see resolve_ud_negate(). */
   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
221
222 exec_list
223 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
224 fs_reg offset)
225 {
226 exec_list instructions;
227 fs_inst *inst;
228
229 if (intel->gen >= 7) {
230 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
231 dst, surf_index, offset);
232 instructions.push_tail(inst);
233 } else {
234 int base_mrf = 13;
235 bool header_present = true;
236
237 fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
238 mrf.type = BRW_REGISTER_TYPE_D;
239
240 /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
241 * dword-aligned byte offset.
242 */
243 if (intel->gen == 6) {
244 instructions.push_tail(MOV(mrf, offset));
245 } else {
246 instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
247 }
248 inst = MOV(mrf, offset);
249 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
250 dst, surf_index);
251 inst->header_present = header_present;
252 inst->base_mrf = base_mrf;
253 inst->mlen = header_present + dispatch_width / 8;
254
255 instructions.push_tail(inst);
256 }
257
258 return instructions;
259 }
260
261 bool
262 fs_inst::equals(fs_inst *inst)
263 {
264 return (opcode == inst->opcode &&
265 dst.equals(inst->dst) &&
266 src[0].equals(inst->src[0]) &&
267 src[1].equals(inst->src[1]) &&
268 src[2].equals(inst->src[2]) &&
269 saturate == inst->saturate &&
270 predicate == inst->predicate &&
271 conditional_mod == inst->conditional_mod &&
272 mlen == inst->mlen &&
273 base_mrf == inst->base_mrf &&
274 sampler == inst->sampler &&
275 target == inst->target &&
276 eot == inst->eot &&
277 header_present == inst->header_present &&
278 shadow_compare == inst->shadow_compare &&
279 offset == inst->offset);
280 }
281
/**
 * Returns how many virtual GRF registers this instruction's destination
 * spans: 4 for texture operations (a full vec4 result), 1 otherwise.
 */
int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}
294
295 bool
296 fs_inst::overwrites_reg(const fs_reg &reg)
297 {
298 return (reg.file == dst.file &&
299 reg.reg == dst.reg &&
300 reg.reg_offset >= dst.reg_offset &&
301 reg.reg_offset < dst.reg_offset + regs_written());
302 }
303
304 bool
305 fs_inst::is_tex()
306 {
307 return (opcode == SHADER_OPCODE_TEX ||
308 opcode == FS_OPCODE_TXB ||
309 opcode == SHADER_OPCODE_TXD ||
310 opcode == SHADER_OPCODE_TXF ||
311 opcode == SHADER_OPCODE_TXL ||
312 opcode == SHADER_OPCODE_TXS);
313 }
314
315 bool
316 fs_inst::is_math()
317 {
318 return (opcode == SHADER_OPCODE_RCP ||
319 opcode == SHADER_OPCODE_RSQ ||
320 opcode == SHADER_OPCODE_SQRT ||
321 opcode == SHADER_OPCODE_EXP2 ||
322 opcode == SHADER_OPCODE_LOG2 ||
323 opcode == SHADER_OPCODE_SIN ||
324 opcode == SHADER_OPCODE_COS ||
325 opcode == SHADER_OPCODE_INT_QUOTIENT ||
326 opcode == SHADER_OPCODE_INT_REMAINDER ||
327 opcode == SHADER_OPCODE_POW);
328 }
329
330 bool
331 fs_inst::is_send_from_grf()
332 {
333 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
334 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
335 src[1].file == GRF));
336 }
337
338 bool
339 fs_visitor::can_do_source_mods(fs_inst *inst)
340 {
341 if (intel->gen == 6 && inst->is_math())
342 return false;
343
344 if (inst->is_send_from_grf())
345 return false;
346
347 return true;
348 }
349
/**
 * Resets this register to a zeroed default state.  smear is set to -1,
 * the "no single channel selected" sentinel (see get_timestamp(), which
 * sets smear >= 0 to pick out one channel).
 */
void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}
356
/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor for 32-bit floats. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor for signed 32-bit integers. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor for unsigned 32-bit integers. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Constructor wrapping a fixed, fully-specified hardware brw_reg. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}
399
/**
 * Field-by-field register equality.  Two registers with any reladdr
 * (indirect addressing) never compare equal, even to themselves.
 */
bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           /* Byte-compare the embedded brw_reg rather than its fields. */
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}
415
416 bool
417 fs_reg::is_zero() const
418 {
419 if (file != IMM)
420 return false;
421
422 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
423 }
424
425 bool
426 fs_reg::is_one() const
427 {
428 if (file != IMM)
429 return false;
430
431 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
432 }
433
434 int
435 fs_visitor::type_size(const struct glsl_type *type)
436 {
437 unsigned int size, i;
438
439 switch (type->base_type) {
440 case GLSL_TYPE_UINT:
441 case GLSL_TYPE_INT:
442 case GLSL_TYPE_FLOAT:
443 case GLSL_TYPE_BOOL:
444 return type->components();
445 case GLSL_TYPE_ARRAY:
446 return type_size(type->fields.array) * type->length;
447 case GLSL_TYPE_STRUCT:
448 size = 0;
449 for (i = 0; i < type->length; i++) {
450 size += type_size(type->fields.structure[i].type);
451 }
452 return size;
453 case GLSL_TYPE_SAMPLER:
454 /* Samplers take up no register space, since they're baked in at
455 * link time.
456 */
457 return 0;
458 case GLSL_TYPE_VOID:
459 case GLSL_TYPE_ERROR:
460 case GLSL_TYPE_INTERFACE:
461 assert(!"not reached");
462 break;
463 }
464
465 return 0;
466 }
467
/**
 * Reads the TIMESTAMP architectural register into a fresh virtual GRF and
 * returns it, smeared to channel 0 (the low 32 bits).  Gen7+ only.
 */
fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}
501
/**
 * Records the starting timestamp for shader-time profiling; paired with
 * emit_shader_time_end(), which computes and stores the elapsed time.
 */
void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}
508
/**
 * Emits the shader-time epilogue: reads the timestamp again, checks for a
 * timestamp reset between the two reads, and records either the elapsed
 * time (plus a "written" mark) or a "reset" mark in the shader-time buffer.
 */
void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   /* Pick the 8-wide or 16-wide variants of the bookkeeping slots. */
   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   /* diff = end + (-start), computed via a negate source modifier. */
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}
557
/**
 * Emits a SHADER_TIME_ADD message that accumulates \p value into a freshly
 * allocated slot of the shader-time buffer, and records the slot's
 * type/program for later printout.
 */
void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   /* NOTE(review): this allows index == max_entries, which looks like an
    * off-by-one for a post-increment index — confirm whether max_entries is
    * a capacity or a last-valid-index.
    */
   assert(shader_time_index <= brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   /* MRF payload: dword offset into the buffer, then the value to add. */
   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}
588
589 void
590 fs_visitor::fail(const char *format, ...)
591 {
592 va_list va;
593 char *msg;
594
595 if (failed)
596 return;
597
598 failed = true;
599
600 va_start(va, format);
601 msg = ralloc_vasprintf(mem_ctx, format, va);
602 va_end(va);
603 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
604
605 this->fail_msg = msg;
606
607 if (INTEL_DEBUG & DEBUG_WM) {
608 fprintf(stderr, "%s", msg);
609 }
610 }
611
/* Convenience emit() overloads: each builds a temporary fs_inst with the
 * given opcode/operands and forwards it to the fs_inst-taking emit().
 */
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}
642
/* Nesting counters for instruction-emission modes.  While
 * force_uncompressed_stack is nonzero, newly emitted instructions run
 * 8-wide (uncompressed); while force_sechalf_stack is nonzero they act on
 * the second half of a 16-wide dispatch.  The pop asserts catch
 * mismatched push/pop pairs.
 */
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}
668
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   /* Unary math: one operand's worth of payload, scaled by dispatch width. */
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   /* Binary math: two operands' worth of payload. */
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      /* Only the optional header MRF is written here. */
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
717
718 int
719 fs_visitor::virtual_grf_alloc(int size)
720 {
721 if (virtual_grf_array_size <= virtual_grf_count) {
722 if (virtual_grf_array_size == 0)
723 virtual_grf_array_size = 16;
724 else
725 virtual_grf_array_size *= 2;
726 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
727 virtual_grf_array_size);
728 }
729 virtual_grf_sizes[virtual_grf_count] = size;
730 return virtual_grf_count++;
731 }
732
/** Register file/number constructor, defaulting the type to float. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Register file/number constructor with an explicit register type. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}
750
751 /** Automatic reg constructor. */
752 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
753 {
754 init();
755
756 this->file = GRF;
757 this->reg = v->virtual_grf_alloc(v->type_size(type));
758 this->reg_offset = 0;
759 this->type = brw_type_for_base_type(type);
760 }
761
762 fs_reg *
763 fs_visitor::variable_storage(ir_variable *var)
764 {
765 return (fs_reg *)hash_table_find(this->variable_ht, var);
766 }
767
768 void
769 import_uniforms_callback(const void *key,
770 void *data,
771 void *closure)
772 {
773 struct hash_table *dst_ht = (struct hash_table *)closure;
774 const fs_reg *reg = (const fs_reg *)data;
775
776 if (reg->file != UNIFORM)
777 return;
778
779 hash_table_insert(dst_ht, data, key);
780 }
781
/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   /* Copy every UNIFORM-file mapping from the 8-wide visitor's table. */
   hash_table_call_foreach(v->variable_ht,
			   import_uniforms_callback,
			   variable_ht);
   /* Share (not copy) the 8-wide visitor's parameter remap table. */
   this->params_remap = v->params_remap;
}
793
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &prog->UniformStorage[u];

      /* Match "name" exactly, or "name." / "name[" prefixes; reject other
       * uniforms that merely share the prefix (e.g. "name2").
       */
      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      /* Point prog_data.param at each scalar slot of the driver storage. */
      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
}
835
836
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					    (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
	 int swiz = GET_SWZ(slots[i].swizzle, j);
	 /* A repeated swizzle component marks the end of the unique ones. */
	 if (swiz == last_swiz)
	    break;
	 last_swiz = swiz;

	 c->prog_data.param[c->prog_data.nr_params++] =
	    &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}
870
/**
 * Emits the setup code for gl_FragCoord and returns the register holding
 * its four components.  Handles the pixel-center-integer and
 * origin-upper-left layout qualifiers, flipping Y for FBO rendering.
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   /* Window-space Y must be flipped when the requested origin and the
    * render target's orientation disagree.
    */
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 /* y' = (height - 1) - y, expressed as -y + (height - 1 + offset). */
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      /* Pre-gen6: interpolate Z from the setup data. */
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
918
919 fs_inst *
920 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
921 glsl_interp_qualifier interpolation_mode,
922 bool is_centroid)
923 {
924 brw_wm_barycentric_interp_mode barycoord_mode;
925 if (is_centroid) {
926 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
927 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
928 else
929 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
930 } else {
931 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
932 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
933 else
934 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
935 }
936 return emit(FS_OPCODE_LINTERP, attr,
937 this->delta_x[barycoord_mode],
938 this->delta_y[barycoord_mode], interp);
939 }
940
/**
 * Emits interpolation code (flat, smooth, or noperspective) for a varying
 * input, walking array elements and matrix columns one URB slot at a
 * time, and returns the destination register.
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
	       interp.type = reg->type;
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Smooth/noperspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       /* FINISHME: At some point we probably want to push
		* this farther by giving similar treatment to the
		* other potentially constant components of the
		* attribute, as well as making brw_vs_constval.c
		* handle varyings other than gl_TexCoord.
		*/
	       if (location >= FRAG_ATTRIB_TEX0 &&
		   location <= FRAG_ATTRIB_TEX7 &&
		   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
		  /* Non-projected texcoord: .w is known to be 1.0. */
		  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
	       } else {
		  struct brw_reg interp = interp_reg(location, k);
		  emit_linterp(attr, fs_reg(interp), interpolation_mode,
			       ir->centroid);
		  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
		     /* Get the pixel/sample mask into f0 so that we know
		      * which pixels are lit.  Then, for each channel that is
		      * unlit, replace the centroid data with non-centroid
		      * data.
		      */
		     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
		     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
						  interpolation_mode, false);
		     inst->predicate = BRW_PREDICATE_NORMAL;
		     inst->predicate_inverse = true;
		  }
		  if (intel->gen < 6) {
		     /* Pre-gen6 interpolation yields attr/w; undo the
		      * perspective divide by multiplying back by w.
		      */
		     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
		  }
	       }
	       attr.reg_offset++;
	    }

	 }
	 location++;
      }
   }

   return reg;
}
1032
/**
 * Emits code that materializes gl_FrontFacing as a 0/1 value in a new
 * register and returns that register.
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      /* Shift the sign bit down, invert, and mask to get 1 for front faces
       * and 0 for back faces.
       */
      emit(BRW_OPCODE_ASR, *reg,
	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
	   fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
1056
/**
 * Copies a math-instruction source into a temporary when the hardware
 * can't consume it directly, returning the register to use in its place
 * (possibly the original, unchanged).
 */
fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   /* Expand into a float-typed temporary, retyped to match the source. */
   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}
1083
/**
 * Emits a unary math-unit instruction, handling per-generation operand
 * restrictions and the pre-gen6 MRF-based message setup.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   /* Pre-gen6 math is a send to the math unit; the operand goes in MRFs. */
   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}
1121
/**
 * Emits a binary math-unit instruction (POW or integer divide/remainder),
 * handling per-generation operand restrictions and the pre-gen6 MRF-based
 * message setup with its swapped INT DIV operand order.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
	 fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      /* The second operand is staged into the MRF payload; the first is a
       * regular source on the send.
       */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
1168
1169 void
1170 fs_visitor::assign_curb_setup()
1171 {
1172 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1173 if (dispatch_width == 8) {
1174 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1175 } else {
1176 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1177 }
1178
1179 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1180 foreach_list(node, &this->instructions) {
1181 fs_inst *inst = (fs_inst *)node;
1182
1183 for (unsigned int i = 0; i < 3; i++) {
1184 if (inst->src[i].file == UNIFORM) {
1185 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1186 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1187 constant_nr / 8,
1188 constant_nr % 8);
1189
1190 inst->src[i].file = FIXED_HW_REG;
1191 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1192 }
1193 }
1194 }
1195 }
1196
1197 void
1198 fs_visitor::calculate_urb_setup()
1199 {
1200 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1201 urb_setup[i] = -1;
1202 }
1203
1204 int urb_next = 0;
1205 /* Figure out where each of the incoming setup attributes lands. */
1206 if (intel->gen >= 6) {
1207 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1208 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1209 urb_setup[i] = urb_next++;
1210 }
1211 }
1212 } else {
1213 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1214 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1215 /* Point size is packed into the header, not as a general attribute */
1216 if (i == VERT_RESULT_PSIZ)
1217 continue;
1218
1219 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1220 int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1221
1222 /* The back color slot is skipped when the front color is
1223 * also written to. In addition, some slots can be
1224 * written in the vertex shader and not read in the
1225 * fragment shader. So the register number must always be
1226 * incremented, mapped or not.
1227 */
1228 if (fp_index >= 0)
1229 urb_setup[fp_index] = urb_next;
1230 urb_next++;
1231 }
1232 }
1233
1234 /*
1235 * It's a FS only attribute, and we did interpolation for this attribute
1236 * in SF thread. So, count it here, too.
1237 *
1238 * See compile_sf_prog() for more info.
1239 */
1240 if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1241 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1242 }
1243
1244 /* Each attribute is 4 setup channels, each of which is half a reg. */
1245 c->prog_data.urb_read_length = urb_next * 2;
1246 }
1247
1248 void
1249 fs_visitor::assign_urb_setup()
1250 {
1251 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1252
1253 /* Offset all the urb_setup[] index by the actual position of the
1254 * setup regs, now that the location of the constants has been chosen.
1255 */
1256 foreach_list(node, &this->instructions) {
1257 fs_inst *inst = (fs_inst *)node;
1258
1259 if (inst->opcode == FS_OPCODE_LINTERP) {
1260 assert(inst->src[2].file == FIXED_HW_REG);
1261 inst->src[2].fixed_hw_reg.nr += urb_start;
1262 }
1263
1264 if (inst->opcode == FS_OPCODE_CINTERP) {
1265 assert(inst->src[0].file == FIXED_HW_REG);
1266 inst->src[0].fixed_hw_reg.nr += urb_start;
1267 }
1268 }
1269
1270 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1271 }
1272
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];      /* whether vgrf i may be split */
   int new_virtual_grf[num_vars]; /* vgrf holding i's offset-1 component */

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.  Offset 0 keeps vgrf i itself; offsets
    * 1..size-1 map to the new single-register vgrfs allocated here (the
    * assert checks they really are handed out contiguously).
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   /* Rewrite every non-zero-offset access of a split vgrf to the new
    * single-register vgrf; offset-0 accesses keep the original number.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}
1365
/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many.  -1 means
    * "never referenced"; 0 means "used, new index not yet assigned".
    */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   /* Keep the hand-written list above in sync with the array sizes. */
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays, assigning each live vgrf its final index and
    * sliding its per-register metadata down to match.
    */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}
1450
1451 bool
1452 fs_visitor::remove_dead_constants()
1453 {
1454 if (dispatch_width == 8) {
1455 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1456
1457 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1458 this->params_remap[i] = -1;
1459
1460 /* Find which params are still in use. */
1461 foreach_list(node, &this->instructions) {
1462 fs_inst *inst = (fs_inst *)node;
1463
1464 for (int i = 0; i < 3; i++) {
1465 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1466
1467 if (inst->src[i].file != UNIFORM)
1468 continue;
1469
1470 assert(constant_nr < (int)c->prog_data.nr_params);
1471
1472 /* For now, set this to non-negative. We'll give it the
1473 * actual new number in a moment, in order to keep the
1474 * register numbers nicely ordered.
1475 */
1476 this->params_remap[constant_nr] = 0;
1477 }
1478 }
1479
1480 /* Figure out what the new numbers for the params will be. At some
1481 * point when we're doing uniform array access, we're going to want
1482 * to keep the distinction between .reg and .reg_offset, but for
1483 * now we don't care.
1484 */
1485 unsigned int new_nr_params = 0;
1486 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1487 if (this->params_remap[i] != -1) {
1488 this->params_remap[i] = new_nr_params++;
1489 }
1490 }
1491
1492 /* Update the list of params to be uploaded to match our new numbering. */
1493 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1494 int remapped = this->params_remap[i];
1495
1496 if (remapped == -1)
1497 continue;
1498
1499 c->prog_data.param[remapped] = c->prog_data.param[i];
1500 }
1501
1502 c->prog_data.nr_params = new_nr_params;
1503 } else {
1504 /* This should have been generated in the 8-wide pass already. */
1505 assert(this->params_remap);
1506 }
1507
1508 /* Now do the renumbering of the shader to remove unused params. */
1509 foreach_list(node, &this->instructions) {
1510 fs_inst *inst = (fs_inst *)node;
1511
1512 for (int i = 0; i < 3; i++) {
1513 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1514
1515 if (inst->src[i].file != UNIFORM)
1516 continue;
1517
1518 assert(this->params_remap[constant_nr] != -1);
1519 inst->src[i].reg = this->params_remap[constant_nr];
1520 inst->src[i].reg_offset = 0;
1521 }
1522 }
1523
1524 return true;
1525 }
1526
/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   /* pull_constant_loc[u]: index of uniform u's first element in the pull
    * buffer, or -1 if it hasn't been copied there yet.
    */
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         /* Only relative-addressed uniform reads are rewritten. */
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         /* offset = reladdr + pull-buffer location of this element. */
         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         /* Load the value into a fresh temporary and make this source read
          * the temporary instead of the UNIFORM file.
          */
         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
1601
1602 /**
1603 * Choose accesses from the UNIFORM file to demote to using the pull
1604 * constant buffer.
1605 *
1606 * We allow a fragment shader to have more than the specified minimum
1607 * maximum number of fragment shader uniform components (64). If
1608 * there are too many of these, they'd fill up all of register space.
1609 * So, this will push some of them out to the pull constant buffer and
1610 * update the program to load them.
1611 */
1612 void
1613 fs_visitor::setup_pull_constants()
1614 {
1615 /* Only allow 16 registers (128 uniform components) as push constants. */
1616 unsigned int max_uniform_components = 16 * 8;
1617 if (c->prog_data.nr_params <= max_uniform_components)
1618 return;
1619
1620 if (dispatch_width == 16) {
1621 fail("Pull constants not supported in 16-wide\n");
1622 return;
1623 }
1624
1625 /* Just demote the end of the list. We could probably do better
1626 * here, demoting things that are rarely used in the program first.
1627 */
1628 unsigned int pull_uniform_base = max_uniform_components;
1629
1630 int pull_constant_loc[c->prog_data.nr_params];
1631 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1632 if (i < pull_uniform_base) {
1633 pull_constant_loc[i] = -1;
1634 } else {
1635 pull_constant_loc[i] = -1;
1636 /* If our constant is already being uploaded for reladdr purposes,
1637 * reuse it.
1638 */
1639 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1640 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1641 pull_constant_loc[i] = j;
1642 break;
1643 }
1644 }
1645 if (pull_constant_loc[i] == -1) {
1646 int pull_index = c->prog_data.nr_pull_params++;
1647 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1648 pull_constant_loc[i] = pull_index;;
1649 }
1650 }
1651 }
1652 c->prog_data.nr_params = pull_uniform_base;
1653
1654 foreach_list(node, &this->instructions) {
1655 fs_inst *inst = (fs_inst *)node;
1656
1657 for (int i = 0; i < 3; i++) {
1658 if (inst->src[i].file != UNIFORM)
1659 continue;
1660
1661 int pull_index = pull_constant_loc[inst->src[i].reg +
1662 inst->src[i].reg_offset];
1663 if (pull_index == -1)
1664 continue;
1665
1666 assert(!inst->src[i].reladdr);
1667
1668 fs_reg dst = fs_reg(this, glsl_type::float_type);
1669 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1670 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1671 fs_inst *pull =
1672 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1673 dst, index, offset);
1674 pull->ir = inst->ir;
1675 pull->annotation = inst->annotation;
1676 pull->base_mrf = 14;
1677 pull->mlen = 1;
1678
1679 inst->insert_before(pull);
1680
1681 inst->src[i].file = GRF;
1682 inst->src[i].reg = dst.reg;
1683 inst->src[i].reg_offset = 0;
1684 inst->src[i].smear = pull_index & 3;
1685 }
1686 }
1687 }
1688
1689 bool
1690 fs_visitor::opt_algebraic()
1691 {
1692 bool progress = false;
1693
1694 foreach_list(node, &this->instructions) {
1695 fs_inst *inst = (fs_inst *)node;
1696
1697 switch (inst->opcode) {
1698 case BRW_OPCODE_MUL:
1699 if (inst->src[1].file != IMM)
1700 continue;
1701
1702 /* a * 1.0 = a */
1703 if (inst->src[1].is_one()) {
1704 inst->opcode = BRW_OPCODE_MOV;
1705 inst->src[1] = reg_undef;
1706 progress = true;
1707 break;
1708 }
1709
1710 /* a * 0.0 = 0.0 */
1711 if (inst->src[1].is_zero()) {
1712 inst->opcode = BRW_OPCODE_MOV;
1713 inst->src[0] = inst->src[1];
1714 inst->src[1] = reg_undef;
1715 progress = true;
1716 break;
1717 }
1718
1719 break;
1720 case BRW_OPCODE_ADD:
1721 if (inst->src[1].file != IMM)
1722 continue;
1723
1724 /* a + 0.0 = a */
1725 if (inst->src[1].is_zero()) {
1726 inst->opcode = BRW_OPCODE_MOV;
1727 inst->src[1] = reg_undef;
1728 progress = true;
1729 break;
1730 }
1731 break;
1732 default:
1733 break;
1734 }
1735 }
1736
1737 return progress;
1738 }
1739
1740 /**
1741 * Must be called after calculate_live_intervales() to remove unused
1742 * writes to registers -- register allocation will fail otherwise
1743 * because something deffed but not used won't be considered to
1744 * interfere with other regs.
1745 */
1746 bool
1747 fs_visitor::dead_code_eliminate()
1748 {
1749 bool progress = false;
1750 int pc = 0;
1751
1752 calculate_live_intervals();
1753
1754 foreach_list_safe(node, &this->instructions) {
1755 fs_inst *inst = (fs_inst *)node;
1756
1757 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1758 inst->remove();
1759 progress = true;
1760 }
1761
1762 pc++;
1763 }
1764
1765 if (progress)
1766 live_intervals_valid = false;
1767
1768 return progress;
1769 }
1770
/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 *
 * Returns true if any MOV was eliminated.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Only a plain, unmodified, single-register GRF->GRF MOV whose
       * source and destination live ranges don't interfere can be
       * coalesced this way.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      /* Rewrite every reference to reg_from in the whole shader to use
       * reg_to at the MOV's destination offset.
       */
      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *   ^
       *   |
       *   v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or
       * later, otherwise it will conflict with "to" when we try to coalesce
       * "to" into R anyway.
       */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}
1851
/**
 * Coalesces GRF-to-GRF (or UNIFORM-to-GRF) moves by rewriting later uses of
 * the MOV's destination to read the MOV's source directly, then deleting
 * the MOV.
 *
 * Returns true if any instruction was removed.
 */
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      /* Candidate: an unpredicated, unsaturated MOV into a GRF whose
       * source type matches the destination type.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM)||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = (inst->src[0].abs ||
                                   inst->src[0].negate ||
                                   inst->src[0].file == UNIFORM);

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               /* Fold the use's own modifiers into the replacement:
                * abs discards the MOV's negate, and negates compose.
                */
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1972
1973
1974 bool
1975 fs_visitor::compute_to_mrf()
1976 {
1977 bool progress = false;
1978 int next_ip = 0;
1979
1980 calculate_live_intervals();
1981
1982 foreach_list_safe(node, &this->instructions) {
1983 fs_inst *inst = (fs_inst *)node;
1984
1985 int ip = next_ip;
1986 next_ip++;
1987
1988 if (inst->opcode != BRW_OPCODE_MOV ||
1989 inst->predicate ||
1990 inst->dst.file != MRF || inst->src[0].file != GRF ||
1991 inst->dst.type != inst->src[0].type ||
1992 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1993 continue;
1994
1995 /* Work out which hardware MRF registers are written by this
1996 * instruction.
1997 */
1998 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1999 int mrf_high;
2000 if (inst->dst.reg & BRW_MRF_COMPR4) {
2001 mrf_high = mrf_low + 4;
2002 } else if (dispatch_width == 16 &&
2003 (!inst->force_uncompressed && !inst->force_sechalf)) {
2004 mrf_high = mrf_low + 1;
2005 } else {
2006 mrf_high = mrf_low;
2007 }
2008
2009 /* Can't compute-to-MRF this GRF if someone else was going to
2010 * read it later.
2011 */
2012 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2013 continue;
2014
2015 /* Found a move of a GRF to a MRF. Let's see if we can go
2016 * rewrite the thing that made this GRF to write into the MRF.
2017 */
2018 fs_inst *scan_inst;
2019 for (scan_inst = (fs_inst *)inst->prev;
2020 scan_inst->prev != NULL;
2021 scan_inst = (fs_inst *)scan_inst->prev) {
2022 if (scan_inst->dst.file == GRF &&
2023 scan_inst->dst.reg == inst->src[0].reg) {
2024 /* Found the last thing to write our reg we want to turn
2025 * into a compute-to-MRF.
2026 */
2027
2028 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2029 if (scan_inst->mlen) {
2030 break;
2031 }
2032
2033 /* If it's predicated, it (probably) didn't populate all
2034 * the channels. We might be able to rewrite everything
2035 * that writes that reg, but it would require smarter
2036 * tracking to delay the rewriting until complete success.
2037 */
2038 if (scan_inst->predicate)
2039 break;
2040
2041 /* If it's half of register setup and not the same half as
2042 * our MOV we're trying to remove, bail for now.
2043 */
2044 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2045 scan_inst->force_sechalf != inst->force_sechalf) {
2046 break;
2047 }
2048
2049 /* SEND instructions can't have MRF as a destination. */
2050 if (scan_inst->mlen)
2051 break;
2052
2053 if (intel->gen >= 6) {
2054 /* gen6 math instructions must have the destination be
2055 * GRF, so no compute-to-MRF for them.
2056 */
2057 if (scan_inst->is_math()) {
2058 break;
2059 }
2060 }
2061
2062 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2063 /* Found the creator of our MRF's source value. */
2064 scan_inst->dst.file = MRF;
2065 scan_inst->dst.reg = inst->dst.reg;
2066 scan_inst->saturate |= inst->saturate;
2067 inst->remove();
2068 progress = true;
2069 }
2070 break;
2071 }
2072
2073 /* We don't handle flow control here. Most computation of
2074 * values that end up in MRFs are shortly before the MRF
2075 * write anyway.
2076 */
2077 if (scan_inst->opcode == BRW_OPCODE_DO ||
2078 scan_inst->opcode == BRW_OPCODE_WHILE ||
2079 scan_inst->opcode == BRW_OPCODE_ELSE ||
2080 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2081 break;
2082 }
2083
2084 /* You can't read from an MRF, so if someone else reads our
2085 * MRF's source GRF that we wanted to rewrite, that stops us.
2086 */
2087 bool interfered = false;
2088 for (int i = 0; i < 3; i++) {
2089 if (scan_inst->src[i].file == GRF &&
2090 scan_inst->src[i].reg == inst->src[0].reg &&
2091 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2092 interfered = true;
2093 }
2094 }
2095 if (interfered)
2096 break;
2097
2098 if (scan_inst->dst.file == MRF) {
2099 /* If somebody else writes our MRF here, we can't
2100 * compute-to-MRF before that.
2101 */
2102 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2103 int scan_mrf_high;
2104
2105 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2106 scan_mrf_high = scan_mrf_low + 4;
2107 } else if (dispatch_width == 16 &&
2108 (!scan_inst->force_uncompressed &&
2109 !scan_inst->force_sechalf)) {
2110 scan_mrf_high = scan_mrf_low + 1;
2111 } else {
2112 scan_mrf_high = scan_mrf_low;
2113 }
2114
2115 if (mrf_low == scan_mrf_low ||
2116 mrf_low == scan_mrf_high ||
2117 mrf_high == scan_mrf_low ||
2118 mrf_high == scan_mrf_high) {
2119 break;
2120 }
2121 }
2122
2123 if (scan_inst->mlen > 0) {
2124 /* Found a SEND instruction, which means that there are
2125 * live values in MRFs from base_mrf to base_mrf +
2126 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2127 * above it.
2128 */
2129 if (mrf_low >= scan_inst->base_mrf &&
2130 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2131 break;
2132 }
2133 if (mrf_high >= scan_inst->base_mrf &&
2134 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2135 break;
2136 }
2137 }
2138 }
2139 }
2140
2141 if (progress)
2142 live_intervals_valid = false;
2143
2144 return progress;
2145 }
2146
/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 *
 * Returns true if any write was removed.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   /* last_mrf_move[m]: most recent still-valid unpredicated GRF->MRF MOV
    * into MRF m, or NULL.
    */
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Control flow ends the basic block: forget everything tracked. */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
         continue;
      default:
         break;
      }

      /* If this MOV is identical to the one that already populated the
       * MRF, it's redundant — drop it.
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      /* Record this MOV as the current known contents of its MRF. */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->predicate) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
2225
2226 void
2227 fs_visitor::dump_instruction(fs_inst *inst)
2228 {
2229 if (inst->predicate) {
2230 printf("(%cf0.%d) ",
2231 inst->predicate_inverse ? '-' : '+',
2232 inst->flag_subreg);
2233 }
2234
2235 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2236 opcode_descs[inst->opcode].name) {
2237 printf("%s", opcode_descs[inst->opcode].name);
2238 } else {
2239 printf("op%d", inst->opcode);
2240 }
2241 if (inst->saturate)
2242 printf(".sat");
2243 if (inst->conditional_mod) {
2244 printf(".cmod");
2245 if (!inst->predicate &&
2246 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2247 inst->opcode != BRW_OPCODE_IF &&
2248 inst->opcode != BRW_OPCODE_WHILE))) {
2249 printf(".f0.%d\n", inst->flag_subreg);
2250 }
2251 }
2252 printf(" ");
2253
2254
2255 switch (inst->dst.file) {
2256 case GRF:
2257 printf("vgrf%d", inst->dst.reg);
2258 if (inst->dst.reg_offset)
2259 printf("+%d", inst->dst.reg_offset);
2260 break;
2261 case MRF:
2262 printf("m%d", inst->dst.reg);
2263 break;
2264 case BAD_FILE:
2265 printf("(null)");
2266 break;
2267 case UNIFORM:
2268 printf("***u%d***", inst->dst.reg);
2269 break;
2270 default:
2271 printf("???");
2272 break;
2273 }
2274 printf(", ");
2275
2276 for (int i = 0; i < 3; i++) {
2277 if (inst->src[i].negate)
2278 printf("-");
2279 if (inst->src[i].abs)
2280 printf("|");
2281 switch (inst->src[i].file) {
2282 case GRF:
2283 printf("vgrf%d", inst->src[i].reg);
2284 if (inst->src[i].reg_offset)
2285 printf("+%d", inst->src[i].reg_offset);
2286 break;
2287 case MRF:
2288 printf("***m%d***", inst->src[i].reg);
2289 break;
2290 case UNIFORM:
2291 printf("u%d", inst->src[i].reg);
2292 if (inst->src[i].reg_offset)
2293 printf(".%d", inst->src[i].reg_offset);
2294 break;
2295 case BAD_FILE:
2296 printf("(null)");
2297 break;
2298 default:
2299 printf("???");
2300 break;
2301 }
2302 if (inst->src[i].abs)
2303 printf("|");
2304
2305 if (i < 3)
2306 printf(", ");
2307 }
2308
2309 printf(" ");
2310
2311 if (inst->force_uncompressed)
2312 printf("1sthalf ");
2313
2314 if (inst->force_sechalf)
2315 printf("2ndhalf ");
2316
2317 printf("\n");
2318 }
2319
2320 void
2321 fs_visitor::dump_instructions()
2322 {
2323 int ip = 0;
2324 foreach_list(node, &this->instructions) {
2325 fs_inst *inst = (fs_inst *)node;
2326 printf("%d: ", ip++);
2327 dump_instruction(inst);
2328 }
2329 }
2330
2331 /**
2332 * Possibly returns an instruction that set up @param reg.
2333 *
2334 * Sometimes we want to take the result of some expression/variable
2335 * dereference tree and rewrite the instruction generating the result
2336 * of the tree. When processing the tree, we know that the
2337 * instructions generated are all writing temporaries that are dead
2338 * outside of this tree. So, if we have some instructions that write
2339 * a temporary, we're free to point that temp write somewhere else.
2340 *
2341 * Note that this doesn't guarantee that the instruction generated
2342 * only reg -- it might be the size=4 destination of a texture instruction.
2343 */
2344 fs_inst *
2345 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2346 fs_inst *end,
2347 fs_reg reg)
2348 {
2349 if (end == start ||
2350 end->predicate ||
2351 end->force_uncompressed ||
2352 end->force_sechalf ||
2353 reg.reladdr ||
2354 !reg.equals(end->dst)) {
2355 return NULL;
2356 } else {
2357 return end;
2358 }
2359 }
2360
/* Lays out the gen6+ fragment shader thread payload: records how many
 * payload registers precede the GRFs we can allocate, and where the
 * barycentric coordinates, source depth, and source W live within them.
 * Results are written into c->nr_payload_regs and friends.
 */
void
fs_visitor::setup_payload_gen6()
{
   struct intel_context *intel = &brw->intel;
   /* "uses depth" here means the shader reads gl_FragCoord (WPOS). */
   bool uses_depth =
      (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(intel->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch.*/

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   /* NOTE(review): source W is keyed off the same uses_depth condition as
    * source depth above — presumably WM_STATE enables both together when
    * WPOS is read; confirm against the state setup code.
    */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}
2418
/* Top-level driver for one FS compile at this visitor's dispatch width:
 * emits the FS IR for the program, runs the optimization loop, and
 * register-allocates.  Returns false (with fail_msg set) on failure.
 */
bool
fs_visitor::run()
{
   /* Snapshot the parameter count so we can assert below that no state
    * parameters were appended during compilation (which would have
    * realloced ParameterValues out from under the uniform storage).
    */
   sanity_param_count = fp->Base.Parameters->NumParameters;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (intel->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      /* Debug path: replace the program with a trivial solid-color FS. */
      emit_dummy_fs();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (fp->UsesKill) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      if (shader) {
         /* GLSL path: walk the linked shader's IR. */
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         /* Fixed-function / ARB_fragment_program path. */
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
	 return false;

      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_end();

      emit_fb_writes();

      split_virtual_grfs();

      move_uniform_array_access_to_pull_constants();
      setup_pull_constants();

      /* Run the optimization passes to a fixed point: keep iterating as
       * long as any pass makes progress.
       */
      bool progress;
      do {
	 progress = false;

         compact_virtual_grfs();

	 progress = remove_duplicate_mrf_writes() || progress;

	 progress = opt_algebraic() || progress;
	 progress = opt_cse() || progress;
	 progress = opt_copy_propagate() || progress;
	 progress = dead_code_eliminate() || progress;
	 progress = register_coalesce() || progress;
	 progress = register_coalesce_2() || progress;
	 progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      /* Pre-register-allocation scheduling pass. */
      schedule_instructions(false);

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
	 /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
	 assign_regs_trivial();
      else {
         /* Retry allocation until it succeeds; each failed attempt spills
          * a register, and failed is set if spilling itself fails.
          */
	 while (!assign_regs()) {
	    if (failed)
	       break;
	 }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   /* Post-register-allocation scheduling pass. */
   schedule_instructions(true);

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == fp->Base.Parameters->NumParameters);

   return !failed;
}
2544
/* Compiles the fragment program to native code: runs the 8-wide compile
 * (mandatory), attempts an optional 16-wide compile, and hands both
 * instruction lists to the generator.  Returns the generated assembly,
 * or NULL (with InfoLog set) on failure.
 */
const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   /* For DEBUG_PERF: note whether the GPU was busy before we started, so
    * we can report a compile that stalled rendering.
    */
   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   /* prog is NULL for ARB_fragment_program / fixed function. */
   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      /* The 8-wide compile is required; its failure fails the link. */
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

   /* The 16-wide compile is opportunistic: skipped on gen < 5, when pull
    * parameters are in use, or under INTEL_DEBUG=no16, and a compile
    * failure here just falls back to the 8-wide program.
    */
   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   bool no16 = INTEL_DEBUG & DEBUG_NO16;
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      } else {
         simd16_instructions = &v2.instructions;
      }
   }

   c->prog_data.dispatch_width = 8;

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   simd16_instructions,
                                                   final_assembly_size);

   if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return generated;
}
2623
2624 bool
2625 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2626 {
2627 struct brw_context *brw = brw_context(ctx);
2628 struct intel_context *intel = &brw->intel;
2629 struct brw_wm_prog_key key;
2630
2631 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2632 return true;
2633
2634 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2635 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2636 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2637 bool program_uses_dfdy = fp->UsesDFdy;
2638
2639 memset(&key, 0, sizeof(key));
2640
2641 if (intel->gen < 6) {
2642 if (fp->UsesKill)
2643 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2644
2645 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2646 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2647
2648 /* Just assume depth testing. */
2649 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2650 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2651 }
2652
2653 if (prog->Name != 0)
2654 key.proj_attrib_mask = 0xffffffff;
2655
2656 if (intel->gen < 6)
2657 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2658
2659 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2660 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2661 continue;
2662
2663 if (prog->Name == 0)
2664 key.proj_attrib_mask |= 1 << i;
2665
2666 if (intel->gen < 6) {
2667 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2668
2669 if (vp_index >= 0)
2670 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2671 }
2672 }
2673
2674 key.clamp_fragment_color = true;
2675
2676 for (int i = 0; i < MAX_SAMPLERS; i++) {
2677 if (fp->Base.ShadowSamplers & (1 << i)) {
2678 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2679 key.tex.swizzles[i] =
2680 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2681 } else {
2682 /* Color sampler: assume no swizzling. */
2683 key.tex.swizzles[i] = SWIZZLE_XYZW;
2684 }
2685 }
2686
2687 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2688 key.drawable_height = ctx->DrawBuffer->Height;
2689 }
2690
2691 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2692 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2693 }
2694
2695 key.nr_color_regions = 1;
2696
2697 key.program_string_id = bfp->id;
2698
2699 uint32_t old_prog_offset = brw->wm.prog_offset;
2700 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2701
2702 bool success = do_wm_prog(brw, prog, bfp, &key);
2703
2704 brw->wm.prog_offset = old_prog_offset;
2705 brw->wm.prog_data = old_prog_data;
2706
2707 return success;
2708 }