glsl: Replace most default cases in switches on GLSL type
src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

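/* Zero the whole instruction and give every field a safe default: a NOP
 * opcode, no conditional modifier, and undefined destination and sources.
 */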
void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

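/* Convenience constructors for common one- and two-source opcodes. Each
 * macro expands to an fs_visitor method that allocates the instruction out
 * of mem_ctx and returns it; the caller still has to emit() it.
 */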
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter. gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

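/** Returns true if the two instructions are identical in every field that
 * affects code generation.
 */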
bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

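/** Returns the number of consecutive virtual GRF registers this
 * instruction writes.
 */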
int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

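/** Returns true if this instruction's destination writes over any part of
 * the given register.
 */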
bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

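/** Returns true for SEND messages whose payload lives in the GRF rather
 * than in MRFs.
 */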
bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

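/* Source modifiers (negate/abs) can't be used on gen6 math, which ignores
 * them, or on sends that read their payload straight from the GRF.
 */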
bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

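/**
 * Returns how many components of register space a variable of the given
 * GLSL type takes up. The switch enumerates every GLSL_TYPE_* instead of
 * using a default case, so the compiler can warn when a new base type is
 * added without this function being updated.
 */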
int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
      assert(!"not reached");
      break;
   }

   return 0;
}

fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp. Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes. It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

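/* Shader-time instrumentation: record a timestamp at the top of the
 * program; emit_shader_time_end() reads another, subtracts, and accumulates
 * the delta into a buffer slot for this shader type via
 * emit_shader_time_write().
 */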
void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles. Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}

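/**
 * Marks the compile as failed and records the first failure message;
 * subsequent calls are ignored. The message is printed to stderr when
 * DEBUG_WM is set.
 */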
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

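/** Allocates a new virtual GRF of the given size (in registers) and returns
 * its index, growing the size-tracking array geometrically as needed.
 */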
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name. We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

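/* Emit a LINTERP of the given setup register using the delta_x/delta_y
 * barycentric coordinates that match the requested interpolation qualifier
 * and centroid mode.
 */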
fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit. Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

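/* Emit a unary math instruction. On pre-gen6 hardware, math is a send that
 * takes its operand from the MRF, so base_mrf/mlen are set up here.
 */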
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7]. For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7]. For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

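/* Map UNIFORM-file registers onto the push-constant (CURB) space that is
 * loaded into the thread payload right after the fixed payload registers.
 */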
void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

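/* Decide which URB slot each incoming fragment attribute lands in and
 * compute the resulting urb_read_length.
 */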
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to. In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader. So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization. We can do it once here, safely. This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs. If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything larger than one register. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous. We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs. Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again. Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs. Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

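/* Drop push-constant entries that no instruction still references,
 * compacting prog_data.param[] and renumbering the UNIFORM sources to
 * match. The remap is computed during the 8-wide compile and reused for
 * 16-wide.
 */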
bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative. We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be. At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants. In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms. Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays. No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64). If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list. We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

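/** Local algebraic simplifications: a * 1.0 -> a, a * 0.0 -> 0.0, and
 * a + 0.0 -> a.
 */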
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something def'd but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *  ^
       *  |
       *  v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer. R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
1841 */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}

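/* First coalescing pass: remove raw GRF-to-GRF moves by rewriting later
 * readers of the destination to read the source instead, when nothing in
 * the straight-line code that follows interferes.
 */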
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing. We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM) ||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = (inst->src[0].abs ||
                                   inst->src[0].negate ||
                                   inst->src[0].file == UNIFORM);

      /* Found a move of a GRF to a GRF. Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now. We should do something more specific.
          */
         if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation. When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value. See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}


1973 bool
1974 fs_visitor::compute_to_mrf()
1975 {
1976 bool progress = false;
1977 int next_ip = 0;
1978
1979 calculate_live_intervals();
1980
1981 foreach_list_safe(node, &this->instructions) {
1982 fs_inst *inst = (fs_inst *)node;
1983
1984 int ip = next_ip;
1985 next_ip++;
1986
1987 if (inst->opcode != BRW_OPCODE_MOV ||
1988 inst->predicate ||
1989 inst->dst.file != MRF || inst->src[0].file != GRF ||
1990 inst->dst.type != inst->src[0].type ||
1991 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1992 continue;
1993
1994 /* Work out which hardware MRF registers are written by this
1995 * instruction.
1996 */
1997 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1998 int mrf_high;
1999 if (inst->dst.reg & BRW_MRF_COMPR4) {
2000 mrf_high = mrf_low + 4;
2001 } else if (dispatch_width == 16 &&
2002 (!inst->force_uncompressed && !inst->force_sechalf)) {
2003 mrf_high = mrf_low + 1;
2004 } else {
2005 mrf_high = mrf_low;
2006 }
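/* e.g. (illustrative): a COMPR4 write to m4 also writes m8, giving
 * mrf_low 4 and mrf_high 8; a plain 16-wide write to m4 also writes
 * m5, giving mrf_high 5.
 */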
2007
2008 /* Can't compute-to-MRF this GRF if someone else was going to
2009 * read it later.
2010 */
2011 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2012 continue;
2013
2014 /* Found a move of a GRF to an MRF. Let's see if we can rewrite
2015 * the generating instruction to write into the MRF directly.
2016 */
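/* For example (illustrative IR, assuming vgrf8 has no other uses):
 *
 *    ADD vgrf8, vgrf2, vgrf3
 *    MOV m4, vgrf8
 *
 * can become a single ADD m4, vgrf2, vgrf3.
 */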
2017 fs_inst *scan_inst;
2018 for (scan_inst = (fs_inst *)inst->prev;
2019 scan_inst->prev != NULL;
2020 scan_inst = (fs_inst *)scan_inst->prev) {
2021 if (scan_inst->dst.file == GRF &&
2022 scan_inst->dst.reg == inst->src[0].reg) {
2023 /* Found the last thing to write our reg we want to turn
2024 * into a compute-to-MRF.
2025 */
2026
2027 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2028 if (scan_inst->mlen) {
2029 break;
2030 }
2031
2032 /* If it's predicated, it (probably) didn't populate all
2033 * the channels. We might be able to rewrite everything
2034 * that writes that reg, but it would require smarter
2035 * tracking to delay the rewriting until complete success.
2036 */
2037 if (scan_inst->predicate)
2038 break;
2039
2040 /* If it's half of register setup and not the same half as
2041 * our MOV we're trying to remove, bail for now.
2042 */
2043 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2044 scan_inst->force_sechalf != inst->force_sechalf) {
2045 break;
2046 }
2047
2052 if (intel->gen >= 6) {
2053 /* gen6 math instructions must have the destination be
2054 * GRF, so no compute-to-MRF for them.
2055 */
2056 if (scan_inst->is_math()) {
2057 break;
2058 }
2059 }
2060
2061 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2062 /* Found the creator of our MRF's source value. */
2063 scan_inst->dst.file = MRF;
2064 scan_inst->dst.reg = inst->dst.reg;
2065 scan_inst->saturate |= inst->saturate;
2066 inst->remove();
2067 progress = true;
2068 }
2069 break;
2070 }
2071
2072 /* We don't handle flow control here. Most computation of
2073 * values that end up in MRFs happens shortly before the MRF
2074 * write anyway.
2075 */
2076 if (scan_inst->opcode == BRW_OPCODE_DO ||
2077 scan_inst->opcode == BRW_OPCODE_WHILE ||
2078 scan_inst->opcode == BRW_OPCODE_ELSE ||
2079 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2080 break;
2081 }
2082
2083 /* You can't read from an MRF, so if something else reads the
2084 * source GRF we wanted to rewrite into the MRF, that stops us.
2085 */
2086 bool interfered = false;
2087 for (int i = 0; i < 3; i++) {
2088 if (scan_inst->src[i].file == GRF &&
2089 scan_inst->src[i].reg == inst->src[0].reg &&
2090 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2091 interfered = true;
2092 }
2093 }
2094 if (interfered)
2095 break;
2096
2097 if (scan_inst->dst.file == MRF) {
2098 /* If somebody else writes our MRF here, we can't
2099 * compute-to-MRF before that.
2100 */
2101 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2102 int scan_mrf_high;
2103
2104 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2105 scan_mrf_high = scan_mrf_low + 4;
2106 } else if (dispatch_width == 16 &&
2107 (!scan_inst->force_uncompressed &&
2108 !scan_inst->force_sechalf)) {
2109 scan_mrf_high = scan_mrf_low + 1;
2110 } else {
2111 scan_mrf_high = scan_mrf_low;
2112 }
2113
2114 if (mrf_low == scan_mrf_low ||
2115 mrf_low == scan_mrf_high ||
2116 mrf_high == scan_mrf_low ||
2117 mrf_high == scan_mrf_high) {
2118 break;
2119 }
2120 }
2121
2122 if (scan_inst->mlen > 0) {
2123 /* Found a SEND instruction, which means that there are
2124 * live values in MRFs from base_mrf to base_mrf +
2125 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2126 * above it.
2127 */
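/* e.g. (illustrative): a SEND with base_mrf 2 and mlen 3 has m2-m4
 * live, so a candidate write to m3 or m4 can't be hoisted above it.
 */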
2128 if (mrf_low >= scan_inst->base_mrf &&
2129 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2130 break;
2131 }
2132 if (mrf_high >= scan_inst->base_mrf &&
2133 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2134 break;
2135 }
2136 }
2137 }
2138 }
2139
2140 if (progress)
2141 live_intervals_valid = false;
2142
2143 return progress;
2144 }
2145
2146 /**
2147 * Walks through basic blocks, looking for repeated MRF writes and
2148 * removing the later ones.
2149 */
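/* For example (illustrative IR):
 *
 *    MOV m2, vgrf5
 *    MOV m3, vgrf6
 *    MOV m2, vgrf5   <- m2 already holds this value; removed
 */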
2150 bool
2151 fs_visitor::remove_duplicate_mrf_writes()
2152 {
2153 fs_inst *last_mrf_move[16];
2154 bool progress = false;
2155
2156 /* The MRF tracking below doesn't yet handle compressed (16-wide) instructions. */
2157 if (dispatch_width == 16)
2158 return false;
2159
2160 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2161
2162 foreach_list_safe(node, &this->instructions) {
2163 fs_inst *inst = (fs_inst *)node;
2164
2165 switch (inst->opcode) {
2166 case BRW_OPCODE_DO:
2167 case BRW_OPCODE_WHILE:
2168 case BRW_OPCODE_IF:
2169 case BRW_OPCODE_ELSE:
2170 case BRW_OPCODE_ENDIF:
2171 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2172 continue;
2173 default:
2174 break;
2175 }
2176
2177 if (inst->opcode == BRW_OPCODE_MOV &&
2178 inst->dst.file == MRF) {
2179 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2180 if (prev_inst && inst->equals(prev_inst)) {
2181 inst->remove();
2182 progress = true;
2183 continue;
2184 }
2185 }
2186
2187 /* Clear out the last-write records for MRFs that were overwritten. */
2188 if (inst->dst.file == MRF) {
2189 last_mrf_move[inst->dst.reg] = NULL;
2190 }
2191
2192 if (inst->mlen > 0) {
2193 /* Found a SEND instruction, which will include two or fewer
2194 * implied MRF writes. We could do better here.
2195 */
2196 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2197 last_mrf_move[inst->base_mrf + i] = NULL;
2198 }
2199 }
2200
2201 /* Clear out any MRF move records whose sources got overwritten. */
2202 if (inst->dst.file == GRF) {
2203 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2204 if (last_mrf_move[i] &&
2205 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2206 last_mrf_move[i] = NULL;
2207 }
2208 }
2209 }
2210
2211 if (inst->opcode == BRW_OPCODE_MOV &&
2212 inst->dst.file == MRF &&
2213 inst->src[0].file == GRF &&
2214 !inst->predicate) {
2215 last_mrf_move[inst->dst.reg] = inst;
2216 }
2217 }
2218
2219 if (progress)
2220 live_intervals_valid = false;
2221
2222 return progress;
2223 }
2224
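/**
 * Print one instruction in a human-readable form for debugging.
 *
 * Illustrative output (not from a real shader):
 *
 *    (+f0.0) add.sat vgrf7+1, vgrf5, u2, (null)
 */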
2225 void
2226 fs_visitor::dump_instruction(fs_inst *inst)
2227 {
2228 if (inst->predicate) {
2229 printf("(%cf0.%d) ",
2230 inst->predicate_inverse ? '-' : '+',
2231 inst->flag_subreg);
2232 }
2233
2234 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2235 opcode_descs[inst->opcode].name) {
2236 printf("%s", opcode_descs[inst->opcode].name);
2237 } else {
2238 printf("op%d", inst->opcode);
2239 }
2240 if (inst->saturate)
2241 printf(".sat");
2242 if (inst->conditional_mod) {
2243 printf(".cmod");
2244 if (!inst->predicate &&
2245 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2246 inst->opcode != BRW_OPCODE_IF &&
2247 inst->opcode != BRW_OPCODE_WHILE))) {
2248 printf(".f0.%d\n", inst->flag_subreg);
2249 }
2250 }
2251 printf(" ");
2252
2254 switch (inst->dst.file) {
2255 case GRF:
2256 printf("vgrf%d", inst->dst.reg);
2257 if (inst->dst.reg_offset)
2258 printf("+%d", inst->dst.reg_offset);
2259 break;
2260 case MRF:
2261 printf("m%d", inst->dst.reg);
2262 break;
2263 case BAD_FILE:
2264 printf("(null)");
2265 break;
2266 case UNIFORM:
2267 printf("***u%d***", inst->dst.reg);
2268 break;
2269 default:
2270 printf("???");
2271 break;
2272 }
2273 printf(", ");
2274
2275 for (int i = 0; i < 3; i++) {
2276 if (inst->src[i].negate)
2277 printf("-");
2278 if (inst->src[i].abs)
2279 printf("|");
2280 switch (inst->src[i].file) {
2281 case GRF:
2282 printf("vgrf%d", inst->src[i].reg);
2283 if (inst->src[i].reg_offset)
2284 printf("+%d", inst->src[i].reg_offset);
2285 break;
2286 case MRF:
2287 printf("***m%d***", inst->src[i].reg);
2288 break;
2289 case UNIFORM:
2290 printf("u%d", inst->src[i].reg);
2291 if (inst->src[i].reg_offset)
2292 printf(".%d", inst->src[i].reg_offset);
2293 break;
2294 case BAD_FILE:
2295 printf("(null)");
2296 break;
2297 default:
2298 printf("???");
2299 break;
2300 }
2301 if (inst->src[i].abs)
2302 printf("|");
2303
2304 if (i < 2)
2305 printf(", ");
2306 }
2307
2308 printf(" ");
2309
2310 if (inst->force_uncompressed)
2311 printf("1sthalf ");
2312
2313 if (inst->force_sechalf)
2314 printf("2ndhalf ");
2315
2316 printf("\n");
2317 }
2318
2319 void
2320 fs_visitor::dump_instructions()
2321 {
2322 int ip = 0;
2323 foreach_list(node, &this->instructions) {
2324 fs_inst *inst = (fs_inst *)node;
2325 printf("%d: ", ip++);
2326 dump_instruction(inst);
2327 }
2328 }
2329
2330 /**
2331 * Possibly returns an instruction that set up @param reg.
2332 *
2333 * Sometimes we want to take the result of some expression/variable
2334 * dereference tree and rewrite the instruction generating the result
2335 * of the tree. When processing the tree, we know that the
2336 * instructions generated are all writing temporaries that are dead
2337 * outside of this tree. So, if we have some instructions that write
2338 * a temporary, we're free to point that temp write somewhere else.
2339 *
2340 * Note that this doesn't guarantee that the returned instruction
2341 * wrote only reg -- it might be the size=4 destination of a texture instruction.
2342 */
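/* e.g. (illustrative): after emitting IR for "a + b", passing the
 * last emitted instruction and the result register here lets the
 * caller retarget that ADD's destination instead of emitting an
 * extra MOV.
 */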
2343 fs_inst *
2344 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2345 fs_inst *end,
2346 fs_reg reg)
2347 {
2348 if (end == start ||
2349 end->predicate ||
2350 end->force_uncompressed ||
2351 end->force_sechalf ||
2352 reg.reladdr ||
2353 !reg.equals(end->dst)) {
2354 return NULL;
2355 } else {
2356 return end;
2357 }
2358 }
2359
2360 void
2361 fs_visitor::setup_payload_gen6()
2362 {
2363 struct intel_context *intel = &brw->intel;
2364 bool uses_depth =
2365 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2366 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2367
2368 assert(intel->gen >= 6);
2369
2370 /* R0-1: masks, pixel X/Y coordinates. */
2371 c->nr_payload_regs = 2;
2372 /* R2: only for 32-pixel dispatch. */
2373
2374 /* R3-26: barycentric interpolation coordinates. These appear in the
2375 * same order that they appear in the brw_wm_barycentric_interp_mode
2376 * enum. Each set of coordinates occupies 2 registers if dispatch width
2377 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2378 * appear if they were enabled using the "Barycentric Interpolation
2379 * Mode" bits in WM_STATE.
2380 */
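/* e.g. (illustrative): with two barycentric modes enabled, an
 * 8-wide shader reserves 2 + 2 payload registers here and a
 * 16-wide shader reserves 4 + 4.
 */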
2381 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2382 if (barycentric_interp_modes & (1 << i)) {
2383 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2384 c->nr_payload_regs += 2;
2385 if (dispatch_width == 16) {
2386 c->nr_payload_regs += 2;
2387 }
2388 }
2389 }
2390
2391 /* R27: interpolated depth if uses source depth */
2392 if (uses_depth) {
2393 c->source_depth_reg = c->nr_payload_regs;
2394 c->nr_payload_regs++;
2395 if (dispatch_width == 16) {
2396 /* R28: interpolated depth if not 8-wide. */
2397 c->nr_payload_regs++;
2398 }
2399 }
2400 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2401 if (uses_depth) {
2402 c->source_w_reg = c->nr_payload_regs;
2403 c->nr_payload_regs++;
2404 if (dispatch_width == 16) {
2405 /* R30: interpolated W if not 8-wide. */
2406 c->nr_payload_regs++;
2407 }
2408 }
2409 /* R31: MSAA position offsets. */
2410 /* R32-: bary for 32-pixel. */
2411 /* R58-59: interp W for 32-pixel. */
2412
2413 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2414 c->source_depth_to_render_target = true;
2415 }
2416 }
2417
2418 bool
2419 fs_visitor::run()
2420 {
2421 sanity_param_count = fp->Base.Parameters->NumParameters;
2422 uint32_t orig_nr_params = c->prog_data.nr_params;
2423
2424 if (intel->gen >= 6)
2425 setup_payload_gen6();
2426 else
2427 setup_payload_gen4();
2428
2429 if (0) {
2430 emit_dummy_fs();
2431 } else {
2432 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2433 emit_shader_time_begin();
2434
2435 calculate_urb_setup();
2436 if (intel->gen < 6)
2437 emit_interpolation_setup_gen4();
2438 else
2439 emit_interpolation_setup_gen6();
2440
2441 /* We handle discards by keeping track of the still-live pixels in f0.1.
2442 * Initialize it with the dispatched pixels.
2443 */
2444 if (fp->UsesKill) {
2445 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2446 discard_init->flag_subreg = 1;
2447 }
2448
2449 /* Generate FS IR for main(). (The visitor only descends into
2450 * functions called "main".)
2451 */
2452 if (shader) {
2453 foreach_list(node, &*shader->ir) {
2454 ir_instruction *ir = (ir_instruction *)node;
2455 base_ir = ir;
2456 this->result = reg_undef;
2457 ir->accept(this);
2458 }
2459 } else {
2460 emit_fragment_program_code();
2461 }
2462 base_ir = NULL;
2463 if (failed)
2464 return false;
2465
2466 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2467 emit_shader_time_end();
2468
2469 emit_fb_writes();
2470
2471 split_virtual_grfs();
2472
2473 move_uniform_array_access_to_pull_constants();
2474 setup_pull_constants();
2475
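/* Run the optimization passes to a fixed point: the passes feed
 * each other (e.g. copy propagation can turn a MOV into dead code
 * for dead_code_eliminate() to remove), so one pass making progress
 * can expose more work for the rest.
 */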
2476 bool progress;
2477 do {
2478 progress = false;
2479
2480 compact_virtual_grfs();
2481
2482 progress = remove_duplicate_mrf_writes() || progress;
2483
2484 progress = opt_algebraic() || progress;
2485 progress = opt_cse() || progress;
2486 progress = opt_copy_propagate() || progress;
2487 progress = dead_code_eliminate() || progress;
2488 progress = register_coalesce() || progress;
2489 progress = register_coalesce_2() || progress;
2490 progress = compute_to_mrf() || progress;
2491 } while (progress);
2492
2493 remove_dead_constants();
2494
2495 schedule_instructions(false);
2496
2497 assign_curb_setup();
2498 assign_urb_setup();
2499
2500 if (0) {
2501 /* Debug of register spilling: Go spill everything. */
2502 for (int i = 0; i < virtual_grf_count; i++) {
2503 spill_reg(i);
2504 }
2505 }
2506
2507 if (0)
2508 assign_regs_trivial();
2509 else {
2510 while (!assign_regs()) {
2511 if (failed)
2512 break;
2513 }
2514 }
2515 }
2516 assert(force_uncompressed_stack == 0);
2517 assert(force_sechalf_stack == 0);
2518
2519 if (failed)
2520 return false;
2521
2522 schedule_instructions(true);
2523
2524 if (dispatch_width == 8) {
2525 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2526 } else {
2527 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2528
2529 /* Make sure we didn't try to sneak in an extra uniform */
2530 assert(orig_nr_params == c->prog_data.nr_params);
2531 (void) orig_nr_params;
2532 }
2533
2534 /* If any state parameters were appended, then ParameterValues could have
2535 * been realloced, in which case the driver uniform storage set up by
2536 * _mesa_associate_uniform_storage() would point to freed memory. Make
2537 * sure that didn't happen.
2538 */
2539 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2540
2541 return !failed;
2542 }
2543
2544 const unsigned *
2545 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2546 struct gl_fragment_program *fp,
2547 struct gl_shader_program *prog,
2548 unsigned *final_assembly_size)
2549 {
2550 struct intel_context *intel = &brw->intel;
2551 bool start_busy = false;
2552 float start_time = 0;
2553
2554 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2555 start_busy = (intel->batch.last_bo &&
2556 drm_intel_bo_busy(intel->batch.last_bo));
2557 start_time = get_time();
2558 }
2559
2560 struct brw_shader *shader = NULL;
2561 if (prog)
2562 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2563
2564 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2565 if (shader) {
2566 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2567 _mesa_print_ir(shader->ir, NULL);
2568 printf("\n\n");
2569 } else {
2570 printf("ARB_fragment_program %d ir for native fragment shader\n",
2571 fp->Base.Id);
2572 _mesa_print_program(&fp->Base);
2573 }
2574 }
2575
2576 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2577 */
2578 fs_visitor v(brw, c, prog, fp, 8);
2579 if (!v.run()) {
2580 prog->LinkStatus = false;
2581 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2582
2583 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2584 v.fail_msg);
2585
2586 return NULL;
2587 }
2588
2589 exec_list *simd16_instructions = NULL;
2590 fs_visitor v2(brw, c, prog, fp, 16);
2591 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2592 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2593 v2.import_uniforms(&v);
2594 if (!v2.run()) {
2595 perf_debug("16-wide shader failed to compile, falling back to "
2596 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2597 } else {
2598 simd16_instructions = &v2.instructions;
2599 }
2600 }
2601
2602 c->prog_data.dispatch_width = 8;
2603
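/* generate_assembly() emits the 8-wide program and then, when one
 * compiled successfully, the 16-wide program into the single
 * assembly blob returned below.
 */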
2604 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2605 const unsigned *generated = g.generate_assembly(&v.instructions,
2606 simd16_instructions,
2607 final_assembly_size);
2608
2609 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2610 if (shader->compiled_once)
2611 brw_wm_debug_recompile(brw, prog, &c->key);
2612 shader->compiled_once = true;
2613
2614 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2615 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2616 (get_time() - start_time) * 1000);
2617 }
2618 }
2619
2620 return generated;
2621 }
2622
2623 bool
2624 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2625 {
2626 struct brw_context *brw = brw_context(ctx);
2627 struct intel_context *intel = &brw->intel;
2628 struct brw_wm_prog_key key;
2629
2630 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2631 return true;
2632
2633 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2634 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2635 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2636 bool program_uses_dfdy = fp->UsesDFdy;
2637
2638 memset(&key, 0, sizeof(key));
2639
2640 if (intel->gen < 6) {
2641 if (fp->UsesKill)
2642 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2643
2644 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2645 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2646
2647 /* Just assume depth testing. */
2648 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2649 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2650 }
2651
2652 if (prog->Name != 0)
2653 key.proj_attrib_mask = 0xffffffff;
2654
2655 if (intel->gen < 6)
2656 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2657
2658 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2659 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2660 continue;
2661
2662 if (prog->Name == 0)
2663 key.proj_attrib_mask |= 1 << i;
2664
2665 if (intel->gen < 6) {
2666 int vp_index = _mesa_frag_attrib_to_vert_result((gl_frag_attrib) i);
2667
2668 if (vp_index >= 0)
2669 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2670 }
2671 }
2672
2673 key.clamp_fragment_color = true;
2674
2675 for (int i = 0; i < MAX_SAMPLERS; i++) {
2676 if (fp->Base.ShadowSamplers & (1 << i)) {
2677 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2678 key.tex.swizzles[i] =
2679 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2680 } else {
2681 /* Color sampler: assume no swizzling. */
2682 key.tex.swizzles[i] = SWIZZLE_XYZW;
2683 }
2684 }
2685
2686 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2687 key.drawable_height = ctx->DrawBuffer->Height;
2688 }
2689
2690 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2691 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2692 }
2693
2694 key.nr_color_regions = 1;
2695
2696 key.program_string_id = bfp->id;
2697
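/* do_wm_prog() installs the freshly compiled program as a side
 * effect, so save the current program state here and restore it
 * once the precompile is done.
 */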
2698 uint32_t old_prog_offset = brw->wm.prog_offset;
2699 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2700
2701 bool success = do_wm_prog(brw, prog, bfp, &key);
2702
2703 brw->wm.prog_offset = old_prog_offset;
2704 brw->wm.prog_data = old_prog_data;
2705
2706 return success;
2707 }