i965/fs: Move the failure for gen7 16-wide intdiv to emit_math().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

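/* For illustration (not part of the original file): because CMP updates
 * the flag register as well as the destination, a caller can predicate a
 * following instruction on the comparison.  A minimal sketch, assuming
 * fs_reg values "a" and "b" already exist (this mirrors how min/max
 * selection is emitted elsewhere in the driver):
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(BRW_OPCODE_SEL, dst, a, b);
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 */
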
exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

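/* For illustration (not part of the original file): callers get an
 * exec_list back because the pre-gen7 path can emit more than one
 * instruction.  The usual splice pattern, as used later in this file by
 * move_uniform_array_access_to_pull_constants():
 *
 *    exec_list list = VARYING_PULL_CONSTANT_LOAD(temp, surf_index, offset);
 *    inst->insert_before(&list);
 */
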
bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

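/* For illustration (not part of the original file), some sizes this
 * returns, counted in float-sized components: a vec4 -> 4; a mat3 ->
 * 3 columns * 3 elements = 9; float[10] -> 10 * 1 = 10; and
 * struct { vec3 a; float b; } -> 3 + 1 = 4.
 */
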
fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

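/* For illustration (not part of the original file), the rollover math
 * behind the comment above: a 32-bit counter ticking at ~1.2 GHz wraps
 * after 2^32 / 1.2e9 ~= 3.6 seconds, which is the "~3 seconds" cited.
 */
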
void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

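/* For illustration (not part of the original file), two values this
 * computes: an 8-wide SIN takes 1 MRF (1 * 8/8), while a 16-wide POW or
 * INT DIV takes 4 MRFs (2 operands * 16/8 registers each).
 */
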
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


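/* For illustration (not part of the original file): for a mat4 uniform at
 * location loc, the matrix case above recurses into four vec4 columns, so
 * nr_params grows by 16 (one per component) while the function returns 4,
 * the number of parameter locations consumed.
 */
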
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         this->param_index[c->prog_data.nr_params] = index;
         this->param_offset[c->prog_data.nr_params] = swiz;
         c->prog_data.nr_params++;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

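/* For illustration (not part of the original file), the Y flip above with
 * made-up numbers: flipping for a 600-pixel-tall drawable with
 * half-integer pixel centers gives offset = 0.5 + 600 - 1.0 = 599.5, and
 * with pixel_y negated the emitted ADD computes
 * gl_FragCoord.y = 599.5 - pixel_y.
 */
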
fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }
         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

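/* For illustration (not part of the original file), the gen6+ sequence
 * above, assuming (as the emitted code implies) that g0.0 carries a
 * back-facing flag in bit 15: ASR by 15 brings that bit down to bit 0,
 * NOT inverts it, and AND with 1 masks off the rest, so the result is 1
 * for front-facing and 0 for back-facing.
 */
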
fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

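/* For illustration (not part of the original file): on the pre-gen6 path
 * above, computing q = a / b as
 *
 *    emit_math(SHADER_OPCODE_INT_QUOTIENT, q, a, b);
 *
 * sends b (the denominator) as the instruction's first operand and moves
 * a (the numerator) into MRF base_mrf + 1 -- the opposite of the POW
 * operand ordering, per the PRM quote.
 */
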
/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 */
void
fs_visitor::setup_paramvalues_refs()
{
   if (dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
         (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
         this->param_offset[i];
   }
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

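/* For illustration (not part of the original file), the mapping above
 * with made-up numbers: with nr_payload_regs = 2, uniform slot 11 becomes
 * brw_vec1_grf(2 + 11/8, 11 % 8), i.e. the fourth channel of g3 (g3.3).
 */
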
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 1 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

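/* For illustration (not part of the original file), how the renumbering
 * above works: a virtual GRF v of size 3 keeps v for reg_offset 0 (its
 * size now 1) and gets two fresh size-1 GRFs n and n+1; an access at
 * reg_offset 2 becomes register n + 2 - 1 = n + 1 with offset 0.
 */
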
/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         /* We've already done setup_paramvalues_refs() so no need to worry
          * about param_index and param_offset.
          */
         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

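/* For illustration (not part of the original file), the offset/smear
 * arithmetic above with made-up numbers: a demoted uniform with
 * pull_index 5 loads the 16-byte-aligned block at byte offset
 * (5 * 4) & ~15 = 16, then smear = 5 & 3 = 1 selects the second dword of
 * that block.
 */
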
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

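/* For illustration (not part of the original file), the MUL rewrites
 * above expressed as LIR before/after:
 *
 *    MUL dst, a, 1.0F   becomes   MOV dst, a
 *    MUL dst, a, 0.0F   becomes   MOV dst, 0.0F
 */
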
/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something deffed but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list_safe(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();
      live_intervals_valid = false;
      progress = true;
      continue;
   }

   return progress;
}

1866 bool
1867 fs_visitor::register_coalesce()
1868 {
1869 bool progress = false;
1870 int if_depth = 0;
1871 int loop_depth = 0;
1872
1873 foreach_list_safe(node, &this->instructions) {
1874 fs_inst *inst = (fs_inst *)node;
1875
1876 /* Make sure that we dominate the instructions we're going to
1877 * scan for interfering with our coalescing, or we won't have
1878 * scanned enough to see if anything interferes with our
1879 * coalescing. We don't dominate the following instructions if
1880 * we're in a loop or an if block.
1881 */
1882 switch (inst->opcode) {
1883 case BRW_OPCODE_DO:
1884 loop_depth++;
1885 break;
1886 case BRW_OPCODE_WHILE:
1887 loop_depth--;
1888 break;
1889 case BRW_OPCODE_IF:
1890 if_depth++;
1891 break;
1892 case BRW_OPCODE_ENDIF:
1893 if_depth--;
1894 break;
1895 default:
1896 break;
1897 }
1898 if (loop_depth || if_depth)
1899 continue;
1900
1901 if (inst->opcode != BRW_OPCODE_MOV ||
1902 inst->predicate ||
1903 inst->saturate ||
1904 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1905 inst->src[0].file != UNIFORM)||
1906 inst->dst.type != inst->src[0].type)
1907 continue;
1908
1909 bool has_source_modifiers = (inst->src[0].abs ||
1910 inst->src[0].negate ||
1911 inst->src[0].file == UNIFORM);
1912
1913 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1914 * them: check for no writes to either one until the exit of the
1915 * program.
1916 */
1917 bool interfered = false;
1918
1919 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1920 !scan_inst->is_tail_sentinel();
1921 scan_inst = (fs_inst *)scan_inst->next) {
1922 if (scan_inst->dst.file == GRF) {
1923 if (scan_inst->overwrites_reg(inst->dst) ||
1924 scan_inst->overwrites_reg(inst->src[0])) {
1925 interfered = true;
1926 break;
1927 }
1928 }
1929
1930 /* The gen6 MATH instruction can't handle source modifiers or
1931 * unusual register regions, so avoid coalescing those for
1932 * now. We should do something more specific.
1933 */
1934 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1935 interfered = true;
1936 break;
1937 }
1938
1939 /* The accumulator result appears to get used for the
1940 * conditional modifier generation. When negating a UD
1941 * value, there is a 33rd bit generated for the sign in the
1942 * accumulator value, so now you can't check, for example,
1943 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1944 */
1945 if (scan_inst->conditional_mod &&
1946 inst->src[0].negate &&
1947 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1948 interfered = true;
1949 break;
1950 }
1951 }
1952 if (interfered) {
1953 continue;
1954 }
1955
      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

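/**
 * Attempts to rewrite the instruction that computed a GRF value so that it
 * writes directly into the MRF that a following raw MOV would have copied
 * it to, then removes the MOV.
 */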
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         mrf_high = mrf_low + 4;
      } else if (dispatch_width == 16 &&
                 (!inst->force_uncompressed && !inst->force_sechalf)) {
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }
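      /* (With the COMPR4 bit set, a compressed write to m<n> lands in
       * m<n> and m<n+4> rather than m<n> and m<n+1>, hence the +4.)
       */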

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_use[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
       * the instruction that generated this GRF to write into the MRF
       * instead.
       */
      fs_inst *scan_inst;
      for (scan_inst = (fs_inst *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (fs_inst *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* SENDs can only write to GRFs, so no compute-to-MRF. */
            if (scan_inst->mlen) {
               break;
            }

            /* If it's predicated, it (probably) didn't populate all
             * the channels.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->predicate)
               break;

            /* If it's half of register setup and not the same half as
             * our MOV we're trying to remove, bail for now.
             */
            if (scan_inst->force_uncompressed != inst->force_uncompressed ||
                scan_inst->force_sechalf != inst->force_sechalf) {
               break;
            }

            if (intel->gen >= 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value. */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove();
               progress = true;
            }
            break;
         }

         /* We don't handle flow control here.  Most computation of
          * values that end up in MRFs happens shortly before the MRF
          * write anyway.
          */
         if (scan_inst->opcode == BRW_OPCODE_DO ||
             scan_inst->opcode == BRW_OPCODE_WHILE ||
             scan_inst->opcode == BRW_OPCODE_ELSE ||
             scan_inst->opcode == BRW_OPCODE_ENDIF) {
            break;
         }

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (dispatch_width == 16 &&
                       (!scan_inst->force_uncompressed &&
                        !scan_inst->force_sechalf)) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Compressed instructions would need different MRF tracking, which we
    * don't do yet, so skip the pass in 16-wide dispatch.
    */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_DO:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
         continue;
      default:
         break;
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->predicate) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

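/**
 * Prints one FS IR instruction in a human-readable form for debugging;
 * the output looks like (illustratively) "add.sat vgrf7, vgrf5, u2, (null)".
 */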
void
fs_visitor::dump_instruction(fs_inst *inst)
{
   if (inst->predicate) {
      printf("(%cf0.%d) ",
             inst->predicate_inverse ? '-' : '+',
             inst->flag_subreg);
   }

   if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
       opcode_descs[inst->opcode].name) {
      printf("%s", opcode_descs[inst->opcode].name);
   } else {
      printf("op%d", inst->opcode);
   }
   if (inst->saturate)
      printf(".sat");
   if (inst->conditional_mod) {
      printf(".cmod");
      if (!inst->predicate &&
          (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                              inst->opcode != BRW_OPCODE_IF &&
                              inst->opcode != BRW_OPCODE_WHILE))) {
         printf(".f0.%d", inst->flag_subreg);
      }
   }
   printf(" ");

   switch (inst->dst.file) {
   case GRF:
      printf("vgrf%d", inst->dst.reg);
      if (inst->dst.reg_offset)
         printf("+%d", inst->dst.reg_offset);
      break;
   case MRF:
      printf("m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      printf("(null)");
      break;
   case UNIFORM:
      printf("***u%d***", inst->dst.reg);
      break;
   default:
      printf("???");
      break;
   }
   printf(", ");

   for (int i = 0; i < 3; i++) {
      if (inst->src[i].negate)
         printf("-");
      if (inst->src[i].abs)
         printf("|");
      switch (inst->src[i].file) {
      case GRF:
         printf("vgrf%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf("+%d", inst->src[i].reg_offset);
         break;
      case MRF:
         printf("***m%d***", inst->src[i].reg);
         break;
      case UNIFORM:
         printf("u%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf(".%d", inst->src[i].reg_offset);
         break;
      case BAD_FILE:
         printf("(null)");
         break;
      default:
         printf("???");
         break;
      }
      if (inst->src[i].abs)
         printf("|");

      /* Don't print a trailing comma after the last source. */
      if (i < 2)
         printf(", ");
   }

   printf(" ");

   if (inst->force_uncompressed)
      printf("1sthalf ");

   if (inst->force_sechalf)
      printf("2ndhalf ");

   printf("\n");
}

void
fs_visitor::dump_instructions()
{
   int ip = 0;
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;
      printf("%d: ", ip++);
      dump_instruction(inst);
   }
}

/**
 * Possibly returns an instruction that set up \c reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the instruction wrote only
 * reg -- it might be the size=4 destination of a texture instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           fs_reg reg)
{
   if (end == start ||
       end->predicate ||
       end->force_uncompressed ||
       end->force_sechalf ||
       reg.reladdr ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}

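/**
 * Lays out the gen6+ fragment shader thread payload: the fixed mask and
 * pixel-coordinate registers, then one set of barycentric coordinates per
 * enabled interpolation mode, then optional source depth and source W.
 */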
void
fs_visitor::setup_payload_gen6()
{
   struct intel_context *intel = &brw->intel;
   bool uses_depth =
      (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(intel->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch. */

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth. */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.  Reading WPOS
    * implies both source depth and source W, so the same flag gates both.
    */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}

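/**
 * Drives the translation of the shader into FS IR, the optimization
 * pipeline, and register allocation; returns false if compilation failed.
 */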
bool
fs_visitor::run()
{
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (intel->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      emit_dummy_fs();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (fp->UsesKill) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (The visitor only descends into
       * functions called "main".)
       */
      if (shader) {
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
         return false;

      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_end();

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      move_uniform_array_access_to_pull_constants();
      setup_pull_constants();

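      /* Run the optimization passes to a fixed point: keep going as long
       * as any pass makes progress.
       */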
      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = dead_code_eliminate() || progress;
         progress = register_coalesce() || progress;
         progress = register_coalesce_2() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions(false);

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0) {
         assign_regs_trivial();
      } else {
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   schedule_instructions(true);

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform. */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   return !failed;
}

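/**
 * Compiles the fragment program to native code, generating an 8-wide
 * program and, when possible, a 16-wide variant alongside it.
 */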
const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

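   /* Try to build a 16-wide program as well; if that compile fails we
    * simply ship only the 8-wide program.
    */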
   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      } else {
         simd16_instructions = &v2.instructions;
      }
   }

   c->prog_data.dispatch_width = 8;

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   simd16_instructions,
                                                   final_assembly_size);

   if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return generated;
}

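/**
 * Precompiles the fragment program at link time with a best-guess state
 * key, so that a real compile at draw time is less likely to be needed.
 */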
bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;
   struct brw_wm_prog_key key;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

   if (intel->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (prog->Name != 0)
      key.proj_attrib_mask = 0xffffffff;

   if (intel->gen < 6)
      key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);

   for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
         continue;

      if (prog->Name == 0)
         key.proj_attrib_mask |= 1 << i;

      if (intel->gen < 6) {
         int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

         if (vp_index >= 0)
            key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
      }
   }

   key.clamp_fragment_color = true;

   for (int i = 0; i < MAX_SAMPLERS; i++) {
      if (fp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

   if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   key.nr_color_regions = 1;

   key.program_string_id = bfp->id;

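   /* do_wm_prog() installs the program it builds, so save the currently
    * bound WM program and put it back afterwards; the precompile is only
    * meant to warm the program cache.
    */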
   uint32_t old_prog_offset = brw->wm.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}