i965/fs: Remove the param_index/param_offset indirection.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
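
/* Each ALU1/ALU2 invocation above expands to a small builder; for
 * example, ALU2(ADD) becomes:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * Note that these only construct the instruction out of mem_ctx; the
 * caller still has to emit() it or insert it into an instruction list.
 */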

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
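
/* A typical pattern (a sketch, not taken from this file) pairs CMP with
 * a predicated instruction that consumes the flag register it wrote:
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(BRW_OPCODE_SEL, dst, a, b);
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 */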

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}
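
/* The SHADER_TIME_ADD message payload built above is two MRFs starting
 * at base_mrf: m6 carries the offset into the shader time buffer (one
 * 32-bit slot per entry, hence index * 4) and m7 carries the value to
 * accumulate; mlen == 2 matches that layout.
 */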

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[loc][i].f;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
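
/* For example, a mat3 uniform at location L recurses as three vec3
 * columns above: each column appends three pointers to
 * c->prog_data.param and returns 1, so the matrix occupies locations
 * L..L+2 and nine param slots in all.
 */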

/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}
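
/* The selection above, in table form:
 *
 *    interpolation_mode       centroid  barycoord_mode
 *    INTERP_QUALIFIER_SMOOTH  no        BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC
 *    INTERP_QUALIFIER_SMOOTH  yes       BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC
 *    anything else            no        BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC
 *    anything else            yes       BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC
 */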

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
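
/* On gen6+ the sequence above extracts the payload bit (bit 15 of g0.0,
 * set for back-facing primitives, hence the NOT): the ASR by 15 moves
 * that bit to bit 0, NOT inverts it, and AND 1 throws the rest away,
 * leaving 1 for front faces and 0 for back faces.
 */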

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *    ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}
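
/* Eight floats of push constants pack into each GRF, so with, say,
 * nr_payload_regs == 2, UNIFORM slot 10 maps to g3.2 above
 * (2 + 10 / 8 == 3, subregister 10 % 8 == 2).
 */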

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 1 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }
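
   /* At this point a split virtual GRF v of original size n keeps
    * reg_offset 0 in v itself; offsets 1..n-1 live in the n-1 freshly
    * allocated size-1 GRFs starting at new_virtual_grf[v], which is
    * what the rewrite below relies on.
    */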

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
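
/* The net effect on, e.g., a "uniform float a[8]" indexed as a[i] is
 * (a sketch):
 *
 *    ADD offset, i, <pull_constant_loc of a>
 *    VARYING_PULL_CONSTANT_LOAD temp, SURF_INDEX_FRAG_CONST_BUFFER, offset
 *
 * with the original source operand rewritten to read temp directly.
 */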

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}
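
/* Each UNIFORM_PULL_CONSTANT_LOAD emitted above fetches a 16-byte
 * aligned vec4 from the constant buffer (note the & ~15 on the byte
 * offset); smear = pull_index & 3 then picks out the one component the
 * instruction actually reads.
 */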

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}
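
/* Note that a * 0.0 -> 0.0 is not IEEE-exact (it drops NaN and Inf
 * propagation and the sign of zero); GLSL's precision rules are loose
 * enough that this is considered acceptable here.
 */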

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something def'd but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *  ^
       *  |
       *  v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
       */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}

bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM) ||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = (inst->src[0].abs ||
                                   inst->src[0].negate ||
                                   inst->src[0].file == UNIFORM);

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1982
1983
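/* Tries to rewrite the instruction computing a GRF so that it writes
 * directly into the MRF the GRF was only being copied to.
 *
 * A hypothetical example:
 *
 *    LINTERP vgrf7, ...
 *    MOV m4, vgrf7
 *
 * becomes
 *
 *    LINTERP m4, ...
 *
 * when vgrf7 isn't read again afterwards.
 */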
1984 bool
1985 fs_visitor::compute_to_mrf()
1986 {
1987 bool progress = false;
1988 int next_ip = 0;
1989
1990 calculate_live_intervals();
1991
1992 foreach_list_safe(node, &this->instructions) {
1993 fs_inst *inst = (fs_inst *)node;
1994
1995 int ip = next_ip;
1996 next_ip++;
1997
1998 if (inst->opcode != BRW_OPCODE_MOV ||
1999 inst->predicate ||
2000 inst->dst.file != MRF || inst->src[0].file != GRF ||
2001 inst->dst.type != inst->src[0].type ||
2002 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2003 continue;
2004
2005 /* Work out which hardware MRF registers are written by this
2006 * instruction.
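       *
       * For example, a COMPR4 write to m2 lands in m2 for the first half
       * of the pixels and in m6 (m2 + 4) for the second half, so both
       * ends of that range count as written.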
2007 */
2008 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2009 int mrf_high;
2010 if (inst->dst.reg & BRW_MRF_COMPR4) {
2011 mrf_high = mrf_low + 4;
2012 } else if (dispatch_width == 16 &&
2013 (!inst->force_uncompressed && !inst->force_sechalf)) {
2014 mrf_high = mrf_low + 1;
2015 } else {
2016 mrf_high = mrf_low;
2017 }
2018
2019 /* Can't compute-to-MRF this GRF if someone else was going to
2020 * read it later.
2021 */
2022 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2023 continue;
2024
2025 /* Found a move of a GRF to a MRF. Let's see if we can go
2026 * rewrite the thing that made this GRF to write into the MRF.
2027 */
2028 fs_inst *scan_inst;
2029 for (scan_inst = (fs_inst *)inst->prev;
2030 scan_inst->prev != NULL;
2031 scan_inst = (fs_inst *)scan_inst->prev) {
2032 if (scan_inst->dst.file == GRF &&
2033 scan_inst->dst.reg == inst->src[0].reg) {
2034 	    /* Found the last instruction to write the GRF we want to
2035 	     * turn into a compute-to-MRF.
2036 	     */
2037
2038 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2039 if (scan_inst->mlen) {
2040 break;
2041 }
2042
2043 /* If it's predicated, it (probably) didn't populate all
2044 * the channels. We might be able to rewrite everything
2045 * that writes that reg, but it would require smarter
2046 * tracking to delay the rewriting until complete success.
2047 */
2048 if (scan_inst->predicate)
2049 break;
2050
2051 	    /* If it only writes one half of the register, and it's not the
2052 	     * same half as the MOV we're trying to remove, bail for now.
2053 	     */
2054 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2055 scan_inst->force_sechalf != inst->force_sechalf) {
2056 break;
2057 }
2058
2062
2063 if (intel->gen >= 6) {
2064 /* gen6 math instructions must have the destination be
2065 * GRF, so no compute-to-MRF for them.
2066 */
2067 if (scan_inst->is_math()) {
2068 break;
2069 }
2070 }
2071
2072 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2073 /* Found the creator of our MRF's source value. */
2074 scan_inst->dst.file = MRF;
2075 scan_inst->dst.reg = inst->dst.reg;
2076 scan_inst->saturate |= inst->saturate;
2077 inst->remove();
2078 progress = true;
2079 }
2080 break;
2081 }
2082
2083 	 /* We don't handle control flow here.  Most computation of
2084 	  * values that end up in MRFs happens shortly before the MRF
2085 	  * write anyway.
2086 	  */
2087 if (scan_inst->opcode == BRW_OPCODE_DO ||
2088 scan_inst->opcode == BRW_OPCODE_WHILE ||
2089 scan_inst->opcode == BRW_OPCODE_ELSE ||
2090 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2091 break;
2092 }
2093
2094 	 /* You can't read from an MRF, so if anything else reads the GRF
2095 	  * we wanted to turn into an MRF write, we have to give up.
2096 	  */
2097 bool interfered = false;
2098 for (int i = 0; i < 3; i++) {
2099 if (scan_inst->src[i].file == GRF &&
2100 scan_inst->src[i].reg == inst->src[0].reg &&
2101 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2102 interfered = true;
2103 }
2104 }
2105 if (interfered)
2106 break;
2107
2108 if (scan_inst->dst.file == MRF) {
2109 /* If somebody else writes our MRF here, we can't
2110 * compute-to-MRF before that.
2111 */
2112 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2113 int scan_mrf_high;
2114
2115 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2116 scan_mrf_high = scan_mrf_low + 4;
2117 } else if (dispatch_width == 16 &&
2118 (!scan_inst->force_uncompressed &&
2119 !scan_inst->force_sechalf)) {
2120 scan_mrf_high = scan_mrf_low + 1;
2121 } else {
2122 scan_mrf_high = scan_mrf_low;
2123 }
2124
2125 if (mrf_low == scan_mrf_low ||
2126 mrf_low == scan_mrf_high ||
2127 mrf_high == scan_mrf_low ||
2128 mrf_high == scan_mrf_high) {
2129 break;
2130 }
2131 }
2132
2133 if (scan_inst->mlen > 0) {
2134 /* Found a SEND instruction, which means that there are
2135 * live values in MRFs from base_mrf to base_mrf +
2136 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2137 * above it.
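	     *
	     * E.g., with base_mrf == 2 and mlen == 3, m2..m4 are live
	     * across the SEND, so an MRF write landing in that range
	     * can't be hoisted above it.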
2138 */
2139 if (mrf_low >= scan_inst->base_mrf &&
2140 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2141 break;
2142 }
2143 if (mrf_high >= scan_inst->base_mrf &&
2144 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2145 break;
2146 }
2147 }
2148 }
2149 }
2150
2151 if (progress)
2152 live_intervals_valid = false;
2153
2154 return progress;
2155 }
2156
2157 /**
2158 * Walks through basic blocks, looking for repeated MRF writes and
2159 * removing the later ones.
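 *
 * For example, given a (hypothetical) sequence
 *
 *    MOV m3, vgrf5
 *    MOV m4, vgrf6
 *    MOV m3, vgrf5
 *
 * the second MOV to m3 matches the recorded last write to m3 and is
 * removed.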
2160 */
2161 bool
2162 fs_visitor::remove_duplicate_mrf_writes()
2163 {
2164 fs_inst *last_mrf_move[16];
2165 bool progress = false;
2166
2167    /* This pass's MRF tracking would need updating to handle compressed instructions, so skip it in 16-wide for now. */
2168 if (dispatch_width == 16)
2169 return false;
2170
2171 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2172
2173 foreach_list_safe(node, &this->instructions) {
2174 fs_inst *inst = (fs_inst *)node;
2175
2176 switch (inst->opcode) {
2177 case BRW_OPCODE_DO:
2178 case BRW_OPCODE_WHILE:
2179 case BRW_OPCODE_IF:
2180 case BRW_OPCODE_ELSE:
2181 case BRW_OPCODE_ENDIF:
2182 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2183 continue;
2184 default:
2185 break;
2186 }
2187
2188 if (inst->opcode == BRW_OPCODE_MOV &&
2189 inst->dst.file == MRF) {
2190 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2191 if (prev_inst && inst->equals(prev_inst)) {
2192 inst->remove();
2193 progress = true;
2194 continue;
2195 }
2196 }
2197
2198 /* Clear out the last-write records for MRFs that were overwritten. */
2199 if (inst->dst.file == MRF) {
2200 last_mrf_move[inst->dst.reg] = NULL;
2201 }
2202
2203 if (inst->mlen > 0) {
2204 	 /* Found a SEND instruction, which will include some number of
2205 	  * implied MRF writes.  We could do better here.
2206 */
2207 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2208 last_mrf_move[inst->base_mrf + i] = NULL;
2209 }
2210 }
2211
2212 /* Clear out any MRF move records whose sources got overwritten. */
2213 if (inst->dst.file == GRF) {
2214 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2215 if (last_mrf_move[i] &&
2216 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2217 last_mrf_move[i] = NULL;
2218 }
2219 }
2220 }
2221
2222 if (inst->opcode == BRW_OPCODE_MOV &&
2223 inst->dst.file == MRF &&
2224 inst->src[0].file == GRF &&
2225 !inst->predicate) {
2226 last_mrf_move[inst->dst.reg] = inst;
2227 }
2228 }
2229
2230 if (progress)
2231 live_intervals_valid = false;
2232
2233 return progress;
2234 }
2235
2236 void
2237 fs_visitor::dump_instruction(fs_inst *inst)
2238 {
2239 if (inst->predicate) {
2240 printf("(%cf0.%d) ",
2241 inst->predicate_inverse ? '-' : '+',
2242 inst->flag_subreg);
2243 }
2244
2245 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2246 opcode_descs[inst->opcode].name) {
2247 printf("%s", opcode_descs[inst->opcode].name);
2248 } else {
2249 printf("op%d", inst->opcode);
2250 }
2251 if (inst->saturate)
2252 printf(".sat");
2253 if (inst->conditional_mod) {
2254 printf(".cmod");
2255 if (!inst->predicate &&
2256 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2257 inst->opcode != BRW_OPCODE_IF &&
2258 inst->opcode != BRW_OPCODE_WHILE))) {
2259 	 printf(".f0.%d", inst->flag_subreg);
2260 }
2261 }
2262 printf(" ");
2263
2264
2265 switch (inst->dst.file) {
2266 case GRF:
2267 printf("vgrf%d", inst->dst.reg);
2268 if (inst->dst.reg_offset)
2269 printf("+%d", inst->dst.reg_offset);
2270 break;
2271 case MRF:
2272 printf("m%d", inst->dst.reg);
2273 break;
2274 case BAD_FILE:
2275 printf("(null)");
2276 break;
2277 case UNIFORM:
2278 printf("***u%d***", inst->dst.reg);
2279 break;
2280 default:
2281 printf("???");
2282 break;
2283 }
2284 printf(", ");
2285
2286 for (int i = 0; i < 3; i++) {
2287 if (inst->src[i].negate)
2288 printf("-");
2289 if (inst->src[i].abs)
2290 printf("|");
2291 switch (inst->src[i].file) {
2292 case GRF:
2293 printf("vgrf%d", inst->src[i].reg);
2294 if (inst->src[i].reg_offset)
2295 printf("+%d", inst->src[i].reg_offset);
2296 break;
2297 case MRF:
2298 printf("***m%d***", inst->src[i].reg);
2299 break;
2300 case UNIFORM:
2301 printf("u%d", inst->src[i].reg);
2302 if (inst->src[i].reg_offset)
2303 printf(".%d", inst->src[i].reg_offset);
2304 break;
2305 case BAD_FILE:
2306 printf("(null)");
2307 break;
2308 default:
2309 printf("???");
2310 break;
2311 }
2312 if (inst->src[i].abs)
2313 printf("|");
2314
2315       if (i < 2)
2316 printf(", ");
2317 }
2318
2319 printf(" ");
2320
2321 if (inst->force_uncompressed)
2322 printf("1sthalf ");
2323
2324 if (inst->force_sechalf)
2325 printf("2ndhalf ");
2326
2327 printf("\n");
2328 }
2329
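/* Prints the instruction list, one instruction per line, in roughly this
 * (illustrative) form:
 *
 *    0: mov.sat vgrf7, -vgrf3, (null), (null)
 *    1: add vgrf8, vgrf7, u1, (null)
 */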
2330 void
2331 fs_visitor::dump_instructions()
2332 {
2333 int ip = 0;
2334 foreach_list(node, &this->instructions) {
2335 fs_inst *inst = (fs_inst *)node;
2336 printf("%d: ", ip++);
2337 dump_instruction(inst);
2338 }
2339 }
2340
2341 /**
2342 * Possibly returns an instruction that set up @param reg.
2343 *
2344 * Sometimes we want to take the result of some expression/variable
2345 * dereference tree and rewrite the instruction generating the result
2346 * of the tree. When processing the tree, we know that the
2347 * instructions generated are all writing temporaries that are dead
2348 * outside of this tree. So, if we have some instructions that write
2349 * a temporary, we're free to point that temp write somewhere else.
2350 *
2351  * Note that this doesn't guarantee that the returned instruction wrote
2352  * only @reg -- it might be the size-4 destination of a texture instruction.
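 *
 * A sketch of a typical caller (names are illustrative only):
 *
 *    fs_inst *gen = get_instruction_generating_reg(pre_inst, last_inst, src);
 *    if (gen)
 *       gen->dst = lhs;      // point the generator straight at the new dest
 *    else
 *       ...emit a MOV from src to lhs instead...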
2353 */
2354 fs_inst *
2355 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2356 fs_inst *end,
2357 fs_reg reg)
2358 {
2359 if (end == start ||
2360 end->predicate ||
2361 end->force_uncompressed ||
2362 end->force_sechalf ||
2363 reg.reladdr ||
2364 !reg.equals(end->dst)) {
2365 return NULL;
2366 } else {
2367 return end;
2368 }
2369 }
2370
2371 void
2372 fs_visitor::setup_payload_gen6()
2373 {
2374 struct intel_context *intel = &brw->intel;
2375 bool uses_depth =
2376 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2377 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2378
2379 assert(intel->gen >= 6);
2380
2381 /* R0-1: masks, pixel X/Y coordinates. */
2382 c->nr_payload_regs = 2;
2383    /* R2: only for 32-pixel dispatch. */
2384
2385 /* R3-26: barycentric interpolation coordinates. These appear in the
2386 * same order that they appear in the brw_wm_barycentric_interp_mode
2387 * enum. Each set of coordinates occupies 2 registers if dispatch width
2388 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2389 * appear if they were enabled using the "Barycentric Interpolation
2390 * Mode" bits in WM_STATE.
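    *
    * As a worked example (values assumed): with dispatch_width == 16 and
    * two barycentric modes enabled, each mode takes 4 registers, so the
    * loop below advances nr_payload_regs from 2 to 2 + 4 + 4 = 10.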
2391 */
2392 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2393 if (barycentric_interp_modes & (1 << i)) {
2394 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2395 c->nr_payload_regs += 2;
2396 if (dispatch_width == 16) {
2397 c->nr_payload_regs += 2;
2398 }
2399 }
2400 }
2401
2402    /* R27: interpolated depth, if the shader uses source depth. */
2403 if (uses_depth) {
2404 c->source_depth_reg = c->nr_payload_regs;
2405 c->nr_payload_regs++;
2406 if (dispatch_width == 16) {
2407 /* R28: interpolated depth if not 8-wide. */
2408 c->nr_payload_regs++;
2409 }
2410 }
2411    /* R29: interpolated W (GEN6_WM_USES_SOURCE_W).  Set up under the same uses_depth condition, since source W and source depth both come from gl_FragCoord. */
2412 if (uses_depth) {
2413 c->source_w_reg = c->nr_payload_regs;
2414 c->nr_payload_regs++;
2415 if (dispatch_width == 16) {
2416 /* R30: interpolated W if not 8-wide. */
2417 c->nr_payload_regs++;
2418 }
2419 }
2420 /* R31: MSAA position offsets. */
2421 /* R32-: bary for 32-pixel. */
2422 /* R58-59: interp W for 32-pixel. */
2423
2424 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2425 c->source_depth_to_render_target = true;
2426 }
2427 }
2428
2429 bool
2430 fs_visitor::run()
2431 {
2432 sanity_param_count = fp->Base.Parameters->NumParameters;
2433 uint32_t orig_nr_params = c->prog_data.nr_params;
2434
2435 if (intel->gen >= 6)
2436 setup_payload_gen6();
2437 else
2438 setup_payload_gen4();
2439
2440 if (0) {
2441 emit_dummy_fs();
2442 } else {
2443 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2444 emit_shader_time_begin();
2445
2446 calculate_urb_setup();
2447 if (intel->gen < 6)
2448 emit_interpolation_setup_gen4();
2449 else
2450 emit_interpolation_setup_gen6();
2451
2452 /* We handle discards by keeping track of the still-live pixels in f0.1.
2453 * Initialize it with the dispatched pixels.
2454 */
2455 if (fp->UsesKill) {
2456 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2457 discard_init->flag_subreg = 1;
2458 }
2459
2460       /* Generate FS IR for main().  (The visitor only descends into
2461        * functions called "main".)
2462 */
2463 if (shader) {
2464 foreach_list(node, &*shader->ir) {
2465 ir_instruction *ir = (ir_instruction *)node;
2466 base_ir = ir;
2467 this->result = reg_undef;
2468 ir->accept(this);
2469 }
2470 } else {
2471 emit_fragment_program_code();
2472 }
2473 base_ir = NULL;
2474 if (failed)
2475 return false;
2476
2477 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2478 emit_shader_time_end();
2479
2480 emit_fb_writes();
2481
2482 split_virtual_grfs();
2483
2484 move_uniform_array_access_to_pull_constants();
2485 setup_pull_constants();
2486
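      /* Run the optimization passes to a fixed point: one pass often
       * enables another (copy propagation, for example, can leave a MOV
       * dead for dead_code_eliminate to clean up), so loop until no pass
       * reports progress.
       */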
2487 bool progress;
2488 do {
2489 progress = false;
2490
2491 compact_virtual_grfs();
2492
2493 progress = remove_duplicate_mrf_writes() || progress;
2494
2495 progress = opt_algebraic() || progress;
2496 progress = opt_cse() || progress;
2497 progress = opt_copy_propagate() || progress;
2498 progress = dead_code_eliminate() || progress;
2499 progress = register_coalesce() || progress;
2500 progress = register_coalesce_2() || progress;
2501 progress = compute_to_mrf() || progress;
2502 } while (progress);
2503
2504 remove_dead_constants();
2505
2506 schedule_instructions(false);
2507
2508 assign_curb_setup();
2509 assign_urb_setup();
2510
2511 if (0) {
2512 /* Debug of register spilling: Go spill everything. */
2513 for (int i = 0; i < virtual_grf_count; i++) {
2514 spill_reg(i);
2515 }
2516 }
2517
2518 if (0)
2519 assign_regs_trivial();
2520 else {
2521 while (!assign_regs()) {
2522 if (failed)
2523 break;
2524 }
2525 }
2526 }
2527 assert(force_uncompressed_stack == 0);
2528 assert(force_sechalf_stack == 0);
2529
2530 if (failed)
2531 return false;
2532
2533 schedule_instructions(true);
2534
2535 if (dispatch_width == 8) {
2536 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2537 } else {
2538 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2539
2540 /* Make sure we didn't try to sneak in an extra uniform */
2541 assert(orig_nr_params == c->prog_data.nr_params);
2542 (void) orig_nr_params;
2543 }
2544
2545 /* If any state parameters were appended, then ParameterValues could have
2546 * been realloced, in which case the driver uniform storage set up by
2547 * _mesa_associate_uniform_storage() would point to freed memory. Make
2548 * sure that didn't happen.
2549 */
2550 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2551
2552 return !failed;
2553 }
2554
2555 const unsigned *
2556 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2557 struct gl_fragment_program *fp,
2558 struct gl_shader_program *prog,
2559 unsigned *final_assembly_size)
2560 {
2561 struct intel_context *intel = &brw->intel;
2562 bool start_busy = false;
2563 float start_time = 0;
2564
2565 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2566 start_busy = (intel->batch.last_bo &&
2567 drm_intel_bo_busy(intel->batch.last_bo));
2568 start_time = get_time();
2569 }
2570
2571 struct brw_shader *shader = NULL;
2572 if (prog)
2573 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2574
2575 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2576 if (shader) {
2577 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2578 _mesa_print_ir(shader->ir, NULL);
2579 printf("\n\n");
2580 } else {
2581 printf("ARB_fragment_program %d ir for native fragment shader\n",
2582 fp->Base.Id);
2583 _mesa_print_program(&fp->Base);
2584 }
2585 }
2586
2587 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2588 */
2589 fs_visitor v(brw, c, prog, fp, 8);
2590 if (!v.run()) {
2591 prog->LinkStatus = false;
2592 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2593
2594 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2595 v.fail_msg);
2596
2597 return NULL;
2598 }
2599
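   /* Also try to compile a 16-wide (SIMD16) version, which shades twice
    * as many pixels per thread.  The conditions below skip it on gen4
    * hardware and whenever pull constants are in use, cases this path
    * doesn't handle in 16-wide mode.
    */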
2600 exec_list *simd16_instructions = NULL;
2601 fs_visitor v2(brw, c, prog, fp, 16);
2602 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2603 v2.import_uniforms(&v);
2604 if (!v2.run()) {
2605 perf_debug("16-wide shader failed to compile, falling back to "
2606 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2607 } else {
2608 simd16_instructions = &v2.instructions;
2609 }
2610 }
2611
2612 c->prog_data.dispatch_width = 8;
2613
2614 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2615 const unsigned *generated = g.generate_assembly(&v.instructions,
2616 simd16_instructions,
2617 final_assembly_size);
2618
2619 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2620 if (shader->compiled_once)
2621 brw_wm_debug_recompile(brw, prog, &c->key);
2622 shader->compiled_once = true;
2623
2624 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2625 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2626 (get_time() - start_time) * 1000);
2627 }
2628 }
2629
2630 return generated;
2631 }
2632
2633 bool
2634 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2635 {
2636 struct brw_context *brw = brw_context(ctx);
2637 struct intel_context *intel = &brw->intel;
2638 struct brw_wm_prog_key key;
2639
2640 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2641 return true;
2642
2643 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2644 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2645 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2646 bool program_uses_dfdy = fp->UsesDFdy;
2647
2648 memset(&key, 0, sizeof(key));
2649
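   /* The key built here is a guess at the state this program is likely to
    * be used with; if the guess turns out wrong, the driver simply
    * compiles another variant of the program at draw time.
    */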
2650 if (intel->gen < 6) {
2651 if (fp->UsesKill)
2652 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2653
2654 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2655 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2656
2657 /* Just assume depth testing. */
2658 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2659 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2660 }
2661
2662 if (prog->Name != 0)
2663 key.proj_attrib_mask = 0xffffffff;
2664
2665 if (intel->gen < 6)
2666 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2667
2668 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2669 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2670 continue;
2671
2672 if (prog->Name == 0)
2673 key.proj_attrib_mask |= 1 << i;
2674
2675 if (intel->gen < 6) {
2676 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2677
2678 if (vp_index >= 0)
2679 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2680 }
2681 }
2682
2683 key.clamp_fragment_color = true;
2684
2685 for (int i = 0; i < MAX_SAMPLERS; i++) {
2686 if (fp->Base.ShadowSamplers & (1 << i)) {
2687 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2688 key.tex.swizzles[i] =
2689 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2690 } else {
2691 /* Color sampler: assume no swizzling. */
2692 key.tex.swizzles[i] = SWIZZLE_XYZW;
2693 }
2694 }
2695
2696 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2697 key.drawable_height = ctx->DrawBuffer->Height;
2698 }
2699
2700 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2701 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2702 }
2703
2704 key.nr_color_regions = 1;
2705
2706 key.program_string_id = bfp->id;
2707
2708 uint32_t old_prog_offset = brw->wm.prog_offset;
2709 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2710
2711 bool success = do_wm_prog(brw, prog, bfp, &key);
2712
2713 brw->wm.prog_offset = old_prog_offset;
2714 brw->wm.prog_data = old_prog_data;
2715
2716 return success;
2717 }