i965: Add asserts to check that we don't realloc ParameterValues.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
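
/* A sketch of typical CMP usage (illustrative, not taken from this file):
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(BRW_OPCODE_SEL, dst, a, b);
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 *
 * The comparison lands in the flag register, so the following SEL can be
 * predicated on it without ever reading the (mostly undefined) destination
 * channels of the CMP.
 */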

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}
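
/* Worked example for the offset handling above (hypothetical values):
 * loading array element 5 sends a dword offset of 5 on gen6/gen7, while
 * the gen4/5 message expects a dword-aligned byte offset, hence the MUL
 * by 4: 5 * 4 = 20 bytes.
 */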

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}
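
/* Rollover arithmetic for the comment above: a 32-bit counter ticking at
 * ~1.2 GHz wraps after 2^32 / 1.2e9 ~= 3.6 seconds, hence the "every ~3
 * seconds" figure (the exact period varies with the current GPU clock).
 */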

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
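
/* Worked example: a SIMD16 SHADER_OPCODE_POW has dispatch_width == 16, so
 * the table above reports 2 * 16 / 8 = 4 MRFs written (two operands, each
 * spanning two registers in 16-wide mode); the SIMD8 version writes 2.
 */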

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of the 8-wide
 * dispatch.  This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
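
/* Slot-accounting sketch (hypothetical uniform): for a mat3, the matrix
 * case above recurses over three vec3 columns.  Each column bumps
 * nr_params by 3 (one per component) but returns 1, so the mat3 spans 3
 * ParameterValues slots while contributing 9 entries to param_index[]
 * and param_offset[].
 */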

/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         this->param_index[c->prog_data.nr_params] = index;
         this->param_offset[c->prog_data.nr_params] = swiz;
         c->prog_data.nr_params++;
      }
   }
}
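
/* Swizzle-walk example (illustrative): a state value swizzled SWIZZLE_XYZW
 * yields swiz = 0,1,2,3, so the j loop above adds four parameters; a scalar
 * replicated as SWIZZLE_XXXX repeats swiz = 0 on j == 1, breaks out, and
 * adds just one -- matching the layout of the value being filled in.
 */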

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}
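
/* Expansion example (a sketch): on gen7, emit_math() on SHADER_OPCODE_POW
 * with src1 = fs_reg(2.0f) can't feed the immediate to the math unit, so
 * fix_math_operand() emits MOV tmp, 2.0f and the POW reads tmp instead.
 */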

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
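
/* Operand-order example for the PRM quote above: INT_QUOTIENT computes
 * src0 / src1, so the denominator (src1) becomes operand 0 and the
 * numerator (src0) is the one copied into MRF base_mrf + 1 as operand 1 --
 * the swap performed by op0/op1 above.
 */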

/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 */
void
fs_visitor::setup_paramvalues_refs()
{
   if (dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
         (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
         this->param_offset[i];
   }
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything larger than one register. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}
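
/* Remap arithmetic sketch (hypothetical register numbers): a size-3
 * virtual GRF v split with new_virtual_grf[v] == n keeps reg_offset 0 in
 * v, while reg_offset 1 maps to register n and reg_offset 2 to n + 1 --
 * i.e. new reg = n + reg_offset - 1 with reg_offset reset to 0, exactly
 * what the rewrite loops above do.
 */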

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         /* We've already done setup_paramvalues_refs() so no need to worry
          * about param_index and param_offset.
          */
         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of the register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}
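
/* Threshold arithmetic for the cap above: 16 push registers * 8 floats per
 * register = 128 uniform components.  With, say, 200 params (hypothetical),
 * params 0..127 stay as push constants and params 128..199 are demoted to
 * the pull buffer, each later fetched via FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD.
 */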

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}
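
/* Example rewrites performed above (illustrative IR):
 *
 *    MUL dst, a, 1.0f  ->  MOV dst, a
 *    MUL dst, a, 0.0f  ->  MOV dst, 0.0f
 *    ADD dst, a, 0.0f  ->  MOV dst, a
 *
 * Only src[1] is checked for an immediate, so a constant sitting in
 * src[0] is not simplified here.
 */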

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something defined but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *   ^
       *   |
       *   v
       *         ^
       *         |
       *         v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
1877 */
1878 live_intervals_valid = false;
1879
1880 progress = true;
1881 continue;
1882 }
1883
1884 return progress;
1885 }
1886
1887 bool
1888 fs_visitor::register_coalesce()
1889 {
1890 bool progress = false;
1891 int if_depth = 0;
1892 int loop_depth = 0;
1893
1894 foreach_list_safe(node, &this->instructions) {
1895 fs_inst *inst = (fs_inst *)node;
1896
1897 /* Make sure that we dominate the instructions we're going to
1898 * scan for interfering with our coalescing, or we won't have
1899 * scanned enough to see if anything interferes with our
1900 * coalescing. We don't dominate the following instructions if
1901 * we're in a loop or an if block.
1902 */
1903 switch (inst->opcode) {
1904 case BRW_OPCODE_DO:
1905 loop_depth++;
1906 break;
1907 case BRW_OPCODE_WHILE:
1908 loop_depth--;
1909 break;
1910 case BRW_OPCODE_IF:
1911 if_depth++;
1912 break;
1913 case BRW_OPCODE_ENDIF:
1914 if_depth--;
1915 break;
1916 default:
1917 break;
1918 }
1919 if (loop_depth || if_depth)
1920 continue;
1921
1922 if (inst->opcode != BRW_OPCODE_MOV ||
1923 inst->predicate ||
1924 inst->saturate ||
1925 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1926 inst->src[0].file != UNIFORM)||
1927 inst->dst.type != inst->src[0].type)
1928 continue;
1929
1930 bool has_source_modifiers = (inst->src[0].abs ||
1931 inst->src[0].negate ||
1932 inst->src[0].file == UNIFORM);
1933
1934 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1935 * them: check for no writes to either one until the exit of the
1936 * program.
1937 */
1938 bool interfered = false;
1939
1940 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1941 !scan_inst->is_tail_sentinel();
1942 scan_inst = (fs_inst *)scan_inst->next) {
1943 if (scan_inst->dst.file == GRF) {
1944 if (scan_inst->overwrites_reg(inst->dst) ||
1945 scan_inst->overwrites_reg(inst->src[0])) {
1946 interfered = true;
1947 break;
1948 }
1949 }
1950
1951 /* The gen6 MATH instruction can't handle source modifiers or
1952 * unusual register regions, so avoid coalescing those for
1953 * now. We should do something more specific.
1954 */
1955 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1956 interfered = true;
1957 break;
1958 }
1959
1960 /* The accumulator result appears to get used for the
1961 * conditional modifier generation. When negating a UD
1962 * value, there is a 33rd bit generated for the sign in the
1963 * accumulator value, so now you can't check, for example,
1964 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1965 */
1966 if (scan_inst->conditional_mod &&
1967 inst->src[0].negate &&
1968 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1969 interfered = true;
1970 break;
1971 }
1972 }
1973 if (interfered) {
1974 continue;
1975 }
1976
1977 /* Rewrite the later usage to point at the source of the move to
1978 * be removed.
1979 */
1980 for (fs_inst *scan_inst = inst;
1981 !scan_inst->is_tail_sentinel();
1982 scan_inst = (fs_inst *)scan_inst->next) {
1983 for (int i = 0; i < 3; i++) {
1984 if (scan_inst->src[i].file == GRF &&
1985 scan_inst->src[i].reg == inst->dst.reg &&
1986 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1987 fs_reg new_src = inst->src[0];
1988 if (scan_inst->src[i].abs) {
1989 new_src.negate = 0;
1990 new_src.abs = 1;
1991 }
1992 new_src.negate ^= scan_inst->src[i].negate;
1993 scan_inst->src[i] = new_src;
1994 }
1995 }
1996 }
1997
1998 inst->remove();
1999 progress = true;
2000 }
2001
2002 if (progress)
2003 live_intervals_valid = false;
2004
2005 return progress;
2006 }
2007
2008
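/* A sketch of the transformation compute_to_mrf() performs (register
 * numbers are hypothetical):
 *
 *    mul vgrf4, vgrf2, vgrf3
 *    mov m2, vgrf4
 *
 * becomes, provided nothing reads vgrf4 afterwards:
 *
 *    mul m2, vgrf2, vgrf3
 */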
2009 bool
2010 fs_visitor::compute_to_mrf()
2011 {
2012 bool progress = false;
2013 int next_ip = 0;
2014
2015 calculate_live_intervals();
2016
2017 foreach_list_safe(node, &this->instructions) {
2018 fs_inst *inst = (fs_inst *)node;
2019
2020 int ip = next_ip;
2021 next_ip++;
2022
2023 if (inst->opcode != BRW_OPCODE_MOV ||
2024 inst->predicate ||
2025 inst->dst.file != MRF || inst->src[0].file != GRF ||
2026 inst->dst.type != inst->src[0].type ||
2027 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2028 continue;
2029
2030 /* Work out which hardware MRF registers are written by this
2031 * instruction.
2032 */
2033 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2034 int mrf_high;
2035 if (inst->dst.reg & BRW_MRF_COMPR4) {
2036 mrf_high = mrf_low + 4;
2037 } else if (dispatch_width == 16 &&
2038 (!inst->force_uncompressed && !inst->force_sechalf)) {
2039 mrf_high = mrf_low + 1;
2040 } else {
2041 mrf_high = mrf_low;
2042 }
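      /* With BRW_MRF_COMPR4 set, the second compressed half lands at
       * dst.reg + 4 rather than dst.reg + 1 (e.g. a COMPR4 write to m2
       * also writes m6), which is why mrf_high is mrf_low + 4 above.
       */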
2043
2044 /* Can't compute-to-MRF this GRF if someone else was going to
2045 * read it later.
2046 */
2047 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2048 continue;
2049
2050       /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
2051        * the instruction that produced this GRF to write into the MRF.
2052        */
2053 fs_inst *scan_inst;
2054 for (scan_inst = (fs_inst *)inst->prev;
2055 scan_inst->prev != NULL;
2056 scan_inst = (fs_inst *)scan_inst->prev) {
2057 if (scan_inst->dst.file == GRF &&
2058 scan_inst->dst.reg == inst->src[0].reg) {
2059          /* Found the last instruction to write the register we want
2060           * to turn into a compute-to-MRF.
2061           */
2062
2063 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2064 if (scan_inst->mlen) {
2065 break;
2066 }
2067
2068 /* If it's predicated, it (probably) didn't populate all
2069 * the channels. We might be able to rewrite everything
2070 * that writes that reg, but it would require smarter
2071 * tracking to delay the rewriting until complete success.
2072 */
2073 if (scan_inst->predicate)
2074 break;
2075
2076 /* If it's half of register setup and not the same half as
2077 * our MOV we're trying to remove, bail for now.
2078 */
2079 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2080 scan_inst->force_sechalf != inst->force_sechalf) {
2081 break;
2082 }
2083
2088 if (intel->gen >= 6) {
2089             /* gen6 math instructions must have a GRF destination,
2090              * so no compute-to-MRF for them.
2091              */
2092 if (scan_inst->is_math()) {
2093 break;
2094 }
2095 }
2096
2097 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2098 /* Found the creator of our MRF's source value. */
2099 scan_inst->dst.file = MRF;
2100 scan_inst->dst.reg = inst->dst.reg;
2101 scan_inst->saturate |= inst->saturate;
2102 inst->remove();
2103 progress = true;
2104 }
2105 break;
2106 }
2107
2108          /* We don't handle flow control here.  Most values that end
2109           * up in MRFs are computed shortly before the MRF write
2110           * anyway.
2111           */
2112 if (scan_inst->opcode == BRW_OPCODE_DO ||
2113 scan_inst->opcode == BRW_OPCODE_WHILE ||
2114 scan_inst->opcode == BRW_OPCODE_ELSE ||
2115 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2116 break;
2117 }
2118
2119 /* You can't read from an MRF, so if someone else reads our
2120 * MRF's source GRF that we wanted to rewrite, that stops us.
2121 */
2122 bool interfered = false;
2123 for (int i = 0; i < 3; i++) {
2124 if (scan_inst->src[i].file == GRF &&
2125 scan_inst->src[i].reg == inst->src[0].reg &&
2126 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2127 interfered = true;
2128 }
2129 }
2130 if (interfered)
2131 break;
2132
2133 if (scan_inst->dst.file == MRF) {
2134 /* If somebody else writes our MRF here, we can't
2135 * compute-to-MRF before that.
2136 */
2137 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2138 int scan_mrf_high;
2139
2140 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2141 scan_mrf_high = scan_mrf_low + 4;
2142 } else if (dispatch_width == 16 &&
2143 (!scan_inst->force_uncompressed &&
2144 !scan_inst->force_sechalf)) {
2145 scan_mrf_high = scan_mrf_low + 1;
2146 } else {
2147 scan_mrf_high = scan_mrf_low;
2148 }
2149
2150 if (mrf_low == scan_mrf_low ||
2151 mrf_low == scan_mrf_high ||
2152 mrf_high == scan_mrf_low ||
2153 mrf_high == scan_mrf_high) {
2154 break;
2155 }
2156 }
2157
2158 if (scan_inst->mlen > 0) {
2159 /* Found a SEND instruction, which means that there are
2160 * live values in MRFs from base_mrf to base_mrf +
2161 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2162 * above it.
2163 */
2164 if (mrf_low >= scan_inst->base_mrf &&
2165 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2166 break;
2167 }
2168 if (mrf_high >= scan_inst->base_mrf &&
2169 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2170 break;
2171 }
2172 }
2173 }
2174 }
2175
2176 if (progress)
2177 live_intervals_valid = false;
2178
2179 return progress;
2180 }
2181
2182 /**
2183 * Walks through basic blocks, looking for repeated MRF writes and
2184 * removing the later ones.
2185 */
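/* For example (hypothetical registers), the second MOV below is removed
 * because neither m3 nor vgrf8 is written in between:
 *
 *    mov m3, vgrf8
 *    ...
 *    mov m3, vgrf8
 */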
2186 bool
2187 fs_visitor::remove_duplicate_mrf_writes()
2188 {
2189 fs_inst *last_mrf_move[16];
2190 bool progress = false;
2191
2192    /* We'd need to update the MRF tracking for compressed instructions; bail for now. */
2193 if (dispatch_width == 16)
2194 return false;
2195
2196 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2197
2198 foreach_list_safe(node, &this->instructions) {
2199 fs_inst *inst = (fs_inst *)node;
2200
2201 switch (inst->opcode) {
2202 case BRW_OPCODE_DO:
2203 case BRW_OPCODE_WHILE:
2204 case BRW_OPCODE_IF:
2205 case BRW_OPCODE_ELSE:
2206 case BRW_OPCODE_ENDIF:
2207 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2208 continue;
2209 default:
2210 break;
2211 }
2212
2213 if (inst->opcode == BRW_OPCODE_MOV &&
2214 inst->dst.file == MRF) {
2215 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2216 if (prev_inst && inst->equals(prev_inst)) {
2217 inst->remove();
2218 progress = true;
2219 continue;
2220 }
2221 }
2222
2223 /* Clear out the last-write records for MRFs that were overwritten. */
2224 if (inst->dst.file == MRF) {
2225 last_mrf_move[inst->dst.reg] = NULL;
2226 }
2227
2228 if (inst->mlen > 0) {
2229 /* Found a SEND instruction, which will include two or fewer
2230 * implied MRF writes. We could do better here.
2231 */
2232 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2233 last_mrf_move[inst->base_mrf + i] = NULL;
2234 }
2235 }
2236
2237 /* Clear out any MRF move records whose sources got overwritten. */
2238 if (inst->dst.file == GRF) {
2239 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2240 if (last_mrf_move[i] &&
2241 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2242 last_mrf_move[i] = NULL;
2243 }
2244 }
2245 }
2246
2247 if (inst->opcode == BRW_OPCODE_MOV &&
2248 inst->dst.file == MRF &&
2249 inst->src[0].file == GRF &&
2250 !inst->predicate) {
2251 last_mrf_move[inst->dst.reg] = inst;
2252 }
2253 }
2254
2255 if (progress)
2256 live_intervals_valid = false;
2257
2258 return progress;
2259 }
2260
2261 void
2262 fs_visitor::dump_instruction(fs_inst *inst)
2263 {
2264 if (inst->predicate) {
2265 printf("(%cf0.%d) ",
2266 inst->predicate_inverse ? '-' : '+',
2267 inst->flag_subreg);
2268 }
2269
2270 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2271 opcode_descs[inst->opcode].name) {
2272 printf("%s", opcode_descs[inst->opcode].name);
2273 } else {
2274 printf("op%d", inst->opcode);
2275 }
2276 if (inst->saturate)
2277 printf(".sat");
2278 if (inst->conditional_mod) {
2279 printf(".cmod");
2280 if (!inst->predicate &&
2281 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2282 inst->opcode != BRW_OPCODE_IF &&
2283 inst->opcode != BRW_OPCODE_WHILE))) {
2284          printf(".f0.%d", inst->flag_subreg);
2285 }
2286 }
2287 printf(" ");
2288
2290 switch (inst->dst.file) {
2291 case GRF:
2292 printf("vgrf%d", inst->dst.reg);
2293 if (inst->dst.reg_offset)
2294 printf("+%d", inst->dst.reg_offset);
2295 break;
2296 case MRF:
2297 printf("m%d", inst->dst.reg);
2298 break;
2299 case BAD_FILE:
2300 printf("(null)");
2301 break;
2302 case UNIFORM:
2303 printf("***u%d***", inst->dst.reg);
2304 break;
2305 default:
2306 printf("???");
2307 break;
2308 }
2309 printf(", ");
2310
2311 for (int i = 0; i < 3; i++) {
2312 if (inst->src[i].negate)
2313 printf("-");
2314 if (inst->src[i].abs)
2315 printf("|");
2316 switch (inst->src[i].file) {
2317 case GRF:
2318 printf("vgrf%d", inst->src[i].reg);
2319 if (inst->src[i].reg_offset)
2320 printf("+%d", inst->src[i].reg_offset);
2321 break;
2322 case MRF:
2323 printf("***m%d***", inst->src[i].reg);
2324 break;
2325 case UNIFORM:
2326 printf("u%d", inst->src[i].reg);
2327 if (inst->src[i].reg_offset)
2328 printf(".%d", inst->src[i].reg_offset);
2329 break;
2330 case BAD_FILE:
2331 printf("(null)");
2332 break;
2333 default:
2334 printf("???");
2335 break;
2336 }
2337 if (inst->src[i].abs)
2338 printf("|");
2339
2340       if (i < 2)
2341 printf(", ");
2342 }
2343
2344 printf(" ");
2345
2346 if (inst->force_uncompressed)
2347 printf("1sthalf ");
2348
2349 if (inst->force_sechalf)
2350 printf("2ndhalf ");
2351
2352 printf("\n");
2353 }
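/* A sample line as printed by dump_instruction() above (the register
 * numbers are made up):
 *
 *    (+f0.0) add.sat vgrf7+1, vgrf5, u2.1, (null) 1sthalf
 */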
2354
2355 void
2356 fs_visitor::dump_instructions()
2357 {
2358 int ip = 0;
2359 foreach_list(node, &this->instructions) {
2360 fs_inst *inst = (fs_inst *)node;
2361 printf("%d: ", ip++);
2362 dump_instruction(inst);
2363 }
2364 }
2365
2366 /**
2367  * Possibly returns the instruction that set up \p reg.
2368 *
2369 * Sometimes we want to take the result of some expression/variable
2370 * dereference tree and rewrite the instruction generating the result
2371 * of the tree. When processing the tree, we know that the
2372 * instructions generated are all writing temporaries that are dead
2373 * outside of this tree. So, if we have some instructions that write
2374 * a temporary, we're free to point that temp write somewhere else.
2375 *
2376  * Note that this doesn't guarantee that the returned instruction wrote
2377  * only reg -- it might be the size=4 destination of a texture instruction.
2378 */
2379 fs_inst *
2380 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2381 fs_inst *end,
2382 fs_reg reg)
2383 {
2384 if (end == start ||
2385 end->predicate ||
2386 end->force_uncompressed ||
2387 end->force_sechalf ||
2388 reg.reladdr ||
2389 !reg.equals(end->dst)) {
2390 return NULL;
2391 } else {
2392 return end;
2393 }
2394 }
2395
2396 void
2397 fs_visitor::setup_payload_gen6()
2398 {
2399 struct intel_context *intel = &brw->intel;
2400 bool uses_depth =
2401 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2402 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2403
2404 assert(intel->gen >= 6);
2405
2406 /* R0-1: masks, pixel X/Y coordinates. */
2407 c->nr_payload_regs = 2;
2408    /* R2: only for 32-pixel dispatch. */
2409
2410 /* R3-26: barycentric interpolation coordinates. These appear in the
2411 * same order that they appear in the brw_wm_barycentric_interp_mode
2412 * enum. Each set of coordinates occupies 2 registers if dispatch width
2413 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2414 * appear if they were enabled using the "Barycentric Interpolation
2415 * Mode" bits in WM_STATE.
2416 */
2417 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2418 if (barycentric_interp_modes & (1 << i)) {
2419 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2420 c->nr_payload_regs += 2;
2421 if (dispatch_width == 16) {
2422 c->nr_payload_regs += 2;
2423 }
2424 }
2425 }
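   /* For example: in SIMD16 with exactly two barycentric modes enabled,
    * the loop above assigns payload regs 2-5 to the first set of
    * coordinates and 6-9 to the second, leaving c->nr_payload_regs == 10.
    */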
2426
2427 /* R27: interpolated depth if uses source depth */
2428 if (uses_depth) {
2429 c->source_depth_reg = c->nr_payload_regs;
2430 c->nr_payload_regs++;
2431 if (dispatch_width == 16) {
2432 /* R28: interpolated depth if not 8-wide. */
2433 c->nr_payload_regs++;
2434 }
2435 }
2436 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2437 if (uses_depth) {
2438 c->source_w_reg = c->nr_payload_regs;
2439 c->nr_payload_regs++;
2440 if (dispatch_width == 16) {
2441 /* R30: interpolated W if not 8-wide. */
2442 c->nr_payload_regs++;
2443 }
2444 }
2445 /* R31: MSAA position offsets. */
2446 /* R32-: bary for 32-pixel. */
2447 /* R58-59: interp W for 32-pixel. */
2448
2449 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2450 c->source_depth_to_render_target = true;
2451 }
2452 }
2453
2454 bool
2455 fs_visitor::run()
2456 {
2457 sanity_param_count = fp->Base.Parameters->NumParameters;
2458 uint32_t orig_nr_params = c->prog_data.nr_params;
2459
2460 if (intel->gen >= 6)
2461 setup_payload_gen6();
2462 else
2463 setup_payload_gen4();
2464
2465 if (0) {
2466 emit_dummy_fs();
2467 } else {
2468 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2469 emit_shader_time_begin();
2470
2471 calculate_urb_setup();
2472 if (intel->gen < 6)
2473 emit_interpolation_setup_gen4();
2474 else
2475 emit_interpolation_setup_gen6();
2476
2477 /* We handle discards by keeping track of the still-live pixels in f0.1.
2478 * Initialize it with the dispatched pixels.
2479 */
2480 if (fp->UsesKill) {
2481 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2482 discard_init->flag_subreg = 1;
2483 }
2484
2485       /* Generate FS IR for main().  (The visitor only descends into
2486        * functions called "main".)
2487        */
2488 if (shader) {
2489 foreach_list(node, &*shader->ir) {
2490 ir_instruction *ir = (ir_instruction *)node;
2491 base_ir = ir;
2492 this->result = reg_undef;
2493 ir->accept(this);
2494 }
2495 } else {
2496 emit_fragment_program_code();
2497 }
2498 base_ir = NULL;
2499 if (failed)
2500 return false;
2501
2502 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2503 emit_shader_time_end();
2504
2505 emit_fb_writes();
2506
2507 split_virtual_grfs();
2508
2509 setup_paramvalues_refs();
2510 move_uniform_array_access_to_pull_constants();
2511 setup_pull_constants();
2512
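      /* Run the passes below to a fixed point: each one can expose new
       * opportunities for the others (copy propagation, for instance,
       * tends to leave dead MOVs behind for dead_code_eliminate()).
       */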
2513 bool progress;
2514 do {
2515 progress = false;
2516
2517 compact_virtual_grfs();
2518
2519 progress = remove_duplicate_mrf_writes() || progress;
2520
2521 progress = opt_algebraic() || progress;
2522 progress = opt_cse() || progress;
2523 progress = opt_copy_propagate() || progress;
2524 progress = dead_code_eliminate() || progress;
2525 progress = register_coalesce() || progress;
2526 progress = register_coalesce_2() || progress;
2527 progress = compute_to_mrf() || progress;
2528 } while (progress);
2529
2530 remove_dead_constants();
2531
2532 schedule_instructions(false);
2533
2534 assign_curb_setup();
2535 assign_urb_setup();
2536
2537 if (0) {
2538 /* Debug of register spilling: Go spill everything. */
2539 for (int i = 0; i < virtual_grf_count; i++) {
2540 spill_reg(i);
2541 }
2542 }
2543
2544 if (0)
2545 assign_regs_trivial();
2546 else {
2547 while (!assign_regs()) {
2548 if (failed)
2549 break;
2550 }
2551 }
2552 }
2553 assert(force_uncompressed_stack == 0);
2554 assert(force_sechalf_stack == 0);
2555
2556 if (failed)
2557 return false;
2558
2559 schedule_instructions(true);
2560
2561 if (dispatch_width == 8) {
2562 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2563 } else {
2564 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2565
2566 /* Make sure we didn't try to sneak in an extra uniform */
2567 assert(orig_nr_params == c->prog_data.nr_params);
2568 (void) orig_nr_params;
2569 }
2570
2571 /* If any state parameters were appended, then ParameterValues could have
2572 * been realloced, in which case the driver uniform storage set up by
2573 * _mesa_associate_uniform_storage() would point to freed memory. Make
2574 * sure that didn't happen.
2575 */
2576 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2577
2578 return !failed;
2579 }
2580
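/* Compiles the fragment program to native code: always as an 8-wide
 * program, plus a 16-wide variant on gen5+ when no pull constants are in
 * use, and hands both instruction lists to fs_generator.
 */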
2581 const unsigned *
2582 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2583 struct gl_fragment_program *fp,
2584 struct gl_shader_program *prog,
2585 unsigned *final_assembly_size)
2586 {
2587 struct intel_context *intel = &brw->intel;
2588 bool start_busy = false;
2589 float start_time = 0;
2590
2591 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2592 start_busy = (intel->batch.last_bo &&
2593 drm_intel_bo_busy(intel->batch.last_bo));
2594 start_time = get_time();
2595 }
2596
2597 struct brw_shader *shader = NULL;
2598 if (prog)
2599 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2600
2601 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2602 if (shader) {
2603 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2604 _mesa_print_ir(shader->ir, NULL);
2605 printf("\n\n");
2606 } else {
2607 printf("ARB_fragment_program %d ir for native fragment shader\n",
2608 fp->Base.Id);
2609 _mesa_print_program(&fp->Base);
2610 }
2611 }
2612
2613 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2614 */
2615 fs_visitor v(brw, c, prog, fp, 8);
2616 if (!v.run()) {
2617 prog->LinkStatus = false;
2618 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2619
2620 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2621 v.fail_msg);
2622
2623 return NULL;
2624 }
2625
2626 exec_list *simd16_instructions = NULL;
2627 fs_visitor v2(brw, c, prog, fp, 16);
2628 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2629 v2.import_uniforms(&v);
2630 if (!v2.run()) {
2631 perf_debug("16-wide shader failed to compile, falling back to "
2632 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2633 } else {
2634 simd16_instructions = &v2.instructions;
2635 }
2636 }
2637
2638 c->prog_data.dispatch_width = 8;
2639
2640 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2641 const unsigned *generated = g.generate_assembly(&v.instructions,
2642 simd16_instructions,
2643 final_assembly_size);
2644
2645 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2646 if (shader->compiled_once)
2647 brw_wm_debug_recompile(brw, prog, &c->key);
2648 shader->compiled_once = true;
2649
2650 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2651 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2652 (get_time() - start_time) * 1000);
2653 }
2654 }
2655
2656 return generated;
2657 }
2658
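/* Precompiles the fragment program at link time using a guessed program
 * key (the guesses are noted inline below), so that the draw-time compile
 * in do_wm_prog() is likely to hit the program cache.
 */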
2659 bool
2660 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2661 {
2662 struct brw_context *brw = brw_context(ctx);
2663 struct intel_context *intel = &brw->intel;
2664 struct brw_wm_prog_key key;
2665
2666 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2667 return true;
2668
2669 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2670 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2671 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2672 bool program_uses_dfdy = fp->UsesDFdy;
2673
2674 memset(&key, 0, sizeof(key));
2675
2676 if (intel->gen < 6) {
2677 if (fp->UsesKill)
2678 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2679
2680 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2681 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2682
2683 /* Just assume depth testing. */
2684 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2685 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2686 }
2687
2688 if (prog->Name != 0)
2689 key.proj_attrib_mask = 0xffffffff;
2690
2691 if (intel->gen < 6)
2692 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2693
2694 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2695 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2696 continue;
2697
2698 if (prog->Name == 0)
2699 key.proj_attrib_mask |= 1 << i;
2700
2701 if (intel->gen < 6) {
2702 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2703
2704 if (vp_index >= 0)
2705 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2706 }
2707 }
2708
2709 key.clamp_fragment_color = true;
2710
2711 for (int i = 0; i < MAX_SAMPLERS; i++) {
2712 if (fp->Base.ShadowSamplers & (1 << i)) {
2713 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2714 key.tex.swizzles[i] =
2715 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2716 } else {
2717 /* Color sampler: assume no swizzling. */
2718 key.tex.swizzles[i] = SWIZZLE_XYZW;
2719 }
2720 }
2721
2722 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2723 key.drawable_height = ctx->DrawBuffer->Height;
2724 }
2725
2726 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2727 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2728 }
2729
2730 key.nr_color_regions = 1;
2731
2732 key.program_string_id = bfp->id;
2733
2734 uint32_t old_prog_offset = brw->wm.prog_offset;
2735 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2736
2737 bool success = do_wm_prog(brw, prog, bfp, &key);
2738
2739 brw->wm.prog_offset = old_prog_offset;
2740 brw->wm.prog_data = old_prog_data;
2741
2742 return success;
2743 }