i965/fs: Optimize saturating SEL.G(E) with imm val <= 0.0f.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "main/uniforms.h"
#include "brw_fs_live_variables.h"
#include "glsl/glsl_types.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;

   /* This will be the case for almost all instructions. */
   this->regs_written = 1;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2(ADDC)
ALU2(SUBB)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(brw->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
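
/* A minimal usage sketch (operands hypothetical): emitting
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_G));
 *
 * throws away the per-channel low-bit results but leaves the packed
 * comparison mask in the flag register, ready to predicate a following
 * SEL or MOV.
 */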

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (brw->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   if (brw->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}
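
/* A worked example of the const_offset split above (hypothetical values):
 * with const_offset = 6 on gen7 (scale = 1), vec4_offset becomes
 * varying_offset + 4, the send fills vec4_result with four contiguous
 * components, and reg_offset += (6 & 3) picks component 2 for the final
 * MOV into dst.
 */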

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF) ||
           (is_tex() && src[0].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (brw->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   if (!inst->can_do_source_mods())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

fs_reg
fs_reg::retype(uint32_t type)
{
   fs_reg result = *this;
   result.type = type;
   return result;
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

bool
fs_reg::is_null() const
{
   return file == HW_REG &&
          fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          fixed_hw_reg.nr == BRW_ARF_NULL;
}

bool
fs_reg::is_valid_3src() const
{
   return file == GRF || file == UNIFORM;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}
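
/* Some illustrative sizes, in components: a float is 1, a vec4 is 4, a
 * mat4 is 16 (components() counts rows * columns), and float[10] is 10.
 * These are the sizes the automatic fs_reg constructor below passes to
 * virtual_grf_alloc().
 */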

fs_reg
fs_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                fs_reg(), payload, offset, value));
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write()
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           this->force_uncompressed ||
           this->force_sechalf);
}
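
/* A concrete case (hypothetical): a MOV predicated on a flag writes only
 * the enabled channels, so the destination's previous contents stay live
 * in the disabled channels.  Predicated SEL is the exception: it writes
 * every channel, picking one of its sources per channel.
 */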

int
fs_inst::regs_read(fs_visitor *v, int arg)
{
   if (is_tex() && arg == 0 && src[0].file == GRF) {
      if (v->dispatch_width == 16)
         return (mlen + 1) / 2;
      else
         return mlen;
   }
   return 1;
}
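
/* The halving above reflects that mlen counts physical GRFs while
 * regs_read() is in units of virtual registers, which in SIMD16 each span
 * two physical GRFs; the +1 rounds up for an odd-length payload.
 */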

bool
fs_inst::reads_flag()
{
   return predicate;
}

bool
fs_inst::writes_flag()
{
   return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
          opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   if (inst->base_mrf == -1)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 2;
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
      return 0;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow the uniform setup from the 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
   this->nr_params_remap = v->nr_params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
   (void)params_before;
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (brw->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (brw->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (c->prog_data.urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               struct brw_reg interp = interp_reg(location, k);
               emit_linterp(attr, fs_reg(interp), interpolation_mode,
                            ir->centroid);
               if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
               if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (brw->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (brw->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (brw->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (brw->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (brw->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      c->prog_data.urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (brw->gen >= 6) {
      if (_mesa_bitcount_64(fp->Base.InputsRead &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               c->prog_data.urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(brw, &prev_stage_vue_map,
                             c->key.input_slots_valid);
         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
             * unused.
             */
            if (varying != BRW_VARYING_SLOT_COUNT &&
                (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               c->prog_data.urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               c->prog_data.urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   c->prog_data.num_varying_inputs = urb_next;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf =
      urb_start + c->prog_data.num_varying_inputs * 2;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 1 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].file == GRF) {
               split_grf[inst->src[i].reg] = false;
            }
         }
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   invalidate_live_intervals();
}
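
/* For example (hypothetical temporary): a size-4 vgrf holding a vec4
 * becomes four independent size-1 vgrfs, so the allocator can place,
 * coalesce, or spill each component separately instead of keeping four
 * contiguous GRFs live together.
 */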

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         invalidate_live_intervals();
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
      this->nr_params_remap = c->prog_data.nr_params;

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            /* Section 5.11 of the OpenGL 4.3 spec says:
             *
             * "Out-of-bounds reads return undefined values, which include
             *  values from other variables of the active program or zero."
             */
            if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
               constant_nr = 0;
            }

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         /* as above, alias out-of-bounds reads to 0 */
         if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
            constant_nr = 0;
         }
         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index,
                                                     *inst->src[i].reladdr,
                                                     pull_constant_loc[uniform] +
                                                     inst->src[i].reg_offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
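
/* To make the flow concrete (hypothetical GLSL): for "uniform float a[64];
 * ... a[i];", the whole array is copied into pull_param, and the reladdr
 * read of a[i] is rewritten as a VARYING_PULL_CONSTANT_LOAD into a fresh
 * temporary, which the instruction then sources instead.
 */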

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}
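
/* A worked example (hypothetical counts): with 160 float params, params
 * [128, 160) are demoted.  A use of one of them loads the 16-byte-aligned
 * block containing it ((pull_index * 4) & ~15) and relies on smear
 * (pull_index & 3) to select the float within that block.
 */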

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_OR:
         if (inst->src[0].equals(inst->src[1])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
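      /* Saturating SEL with an immediate: SEL.l(e) computes min(x, imm),
       * and sat(min(x, imm)) == sat(x) whenever imm >= 1.0; SEL.g(e)
       * computes max(x, imm), and sat(max(x, imm)) == sat(x) whenever
       * imm <= 0.0, since saturate already clamps anything negative to 0
       * (e.g. sat(max(x, -1.0)) == sat(x)).  Either way the SEL can be
       * turned into a saturating MOV of x.
       */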
      case BRW_OPCODE_SEL:
         if (inst->saturate && inst->src[1].file == IMM) {
            switch (inst->conditional_mod) {
            case BRW_CONDITIONAL_LE:
            case BRW_CONDITIONAL_L:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].imm.f >= 1.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
               break;
            case BRW_CONDITIONAL_GE:
            case BRW_CONDITIONAL_G:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].imm.f <= 0.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
               break;
            default:
               break;
            }
         }
         break;
      default:
         break;
      }
   }

   return progress;
}
1926
1927 /**
1928 * Removes any instructions writing a VGRF where that VGRF is not used by any
1929 * later instruction.
1930 */
1931 bool
1932 fs_visitor::dead_code_eliminate()
1933 {
1934 bool progress = false;
1935 int pc = 0;
1936
1937 calculate_live_intervals();
1938
1939 foreach_list_safe(node, &this->instructions) {
1940 fs_inst *inst = (fs_inst *)node;
1941
1942 if (inst->dst.file == GRF) {
1943 bool dead = true;
1944
1945 for (int i = 0; i < inst->regs_written; i++) {
1946 int var = live_intervals->var_from_vgrf[inst->dst.reg];
1947 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
1948 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
1949 dead = false;
1950 break;
1951 }
1952 }
1953
1954 if (dead) {
1955 /* Don't dead code eliminate instructions that write to the
1956 * accumulator as a side-effect. Instead just set the destination
1957 * to the null register to free it.
1958 */
1959 switch (inst->opcode) {
1960 case BRW_OPCODE_ADDC:
1961 case BRW_OPCODE_SUBB:
1962 case BRW_OPCODE_MACH:
1963 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
1964 break;
1965 default:
1966 inst->remove();
1967 progress = true;
1968 break;
1969 }
1970 }
1971 }
1972
1973 pc++;
1974 }
1975
1976 if (progress)
1977 invalidate_live_intervals();
1978
1979 return progress;
1980 }
1981
1982 struct dead_code_hash_key
1983 {
1984 int vgrf;
1985 int reg_offset;
1986 };
1987
1988 static bool
1989 dead_code_hash_compare(const void *a, const void *b)
1990 {
1991 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1992 }
1993
1994 static void
1995 clear_dead_code_hash(struct hash_table *ht)
1996 {
1997 struct hash_entry *entry;
1998
1999 hash_table_foreach(ht, entry) {
2000 _mesa_hash_table_remove(ht, entry);
2001 }
2002 }
2003
2004 static void
2005 insert_dead_code_hash(struct hash_table *ht,
2006 int vgrf, int reg_offset, fs_inst *inst)
2007 {
2008 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2009 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2010
2011 key->vgrf = vgrf;
2012 key->reg_offset = reg_offset;
2013
2014 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2015 }
2016
2017 static struct hash_entry *
2018 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2019 {
2020 struct dead_code_hash_key key;
2021
2022 key.vgrf = vgrf;
2023 key.reg_offset = reg_offset;
2024
2025 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2026 }
2027
2028 static void
2029 remove_dead_code_hash(struct hash_table *ht,
2030 int vgrf, int reg_offset)
2031 {
2032 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2033 if (!entry)
2034 return;
2035
2036 _mesa_hash_table_remove(ht, entry);
2037 }
2038
2039 /**
2040 * Walks basic blocks, removing any regs that are written but not read before
2041 * being redefined.
2042 *
2043 * The dead_code_eliminate() function implements a global dead code
2044 * elimination, but it only handles removing the last write to a register
2045 * if it's never read. This one can handle intermediate writes, but only
2046 * within a basic block.
2047 */
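/* Illustrative IR, within one basic block:
 *
 *    mov vgrf1, a   <- overwritten before being read: eliminated here
 *    mov vgrf1, b
 *    add vgrf2, vgrf1, c
 */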
2048 bool
2049 fs_visitor::dead_code_eliminate_local()
2050 {
2051 struct hash_table *ht;
2052 bool progress = false;
2053
2054 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2055
2056 foreach_list_safe(node, &this->instructions) {
2057 fs_inst *inst = (fs_inst *)node;
2058
2059 /* At a basic block boundary, empty the HT, since we don't track
2060 * dataflow across blocks here.
2061 */
2062 if (inst->is_control_flow()) {
2063 clear_dead_code_hash(ht);
2064 continue;
2065 }
2066
2067 /* Clear the HT of any instructions that got read. */
2068 for (int i = 0; i < 3; i++) {
2069 fs_reg src = inst->src[i];
2070 if (src.file != GRF)
2071 continue;
2072
2073 int read = 1;
2074 if (inst->is_send_from_grf())
2075 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2076
2077 for (int reg_offset = src.reg_offset;
2078 reg_offset < src.reg_offset + read;
2079 reg_offset++) {
2080 remove_dead_code_hash(ht, src.reg, reg_offset);
2081 }
2082 }
2083
2084 /* Add any update of a GRF to the HT, removing a previous write if it
2085 * wasn't read.
2086 */
2087 if (inst->dst.file == GRF) {
2088 if (inst->regs_written > 1) {
2089 /* We don't know how to trim channels from an instruction's
2090 * writes, so we can't incrementally remove unread channels from
2091 * it. Just remove whatever it overwrites from the table.
2092 */
2093 for (int i = 0; i < inst->regs_written; i++) {
2094 remove_dead_code_hash(ht,
2095 inst->dst.reg,
2096 inst->dst.reg_offset + i);
2097 }
2098 } else {
2099 struct hash_entry *entry =
2100 get_dead_code_hash_entry(ht, inst->dst.reg,
2101 inst->dst.reg_offset);
2102
2103 if (inst->is_partial_write()) {
2104 /* For a partial write, we can't remove any previous dead code
2105 * candidate, since we're just modifying its result, but we can
2106 * be dead code eliminated ourselves.
2107 */
2108 if (entry) {
2109 entry->data = inst;
2110 } else {
2111 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2112 inst);
2113 }
2114 } else {
2115 if (entry) {
2116 /* We're completely updating a channel, and there was a
2117 * previous write to the channel that wasn't read. Kill it!
2118 */
2119 fs_inst *inst = (fs_inst *)entry->data;
2120 inst->remove();
2121 progress = true;
2122 _mesa_hash_table_remove(ht, entry);
2123 }
2124
2125 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2126 inst);
2127 }
2128 }
2129 }
2130 }
2131
2132 _mesa_hash_table_destroy(ht, NULL);
2133
2134 if (progress)
2135 invalidate_live_intervals();
2136
2137 return progress;
2138 }
2139
2140 /**
2141 * Implements a second type of register coalescing: This one checks if
2142 * the two regs involved in a raw move don't interfere, in which case
2143 * they can both be stored in the same place and the MOV removed.
2144 */
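/* Illustrative IR: if vgrf1 and vgrf2 don't interfere,
 *
 *    add vgrf1, a, b
 *    mov vgrf2, vgrf1
 *
 * becomes a single "add vgrf2, a, b", with every other reference to vgrf1
 * rewritten to vgrf2 and the MOV removed.
 */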
2145 bool
2146 fs_visitor::register_coalesce_2()
2147 {
2148 bool progress = false;
2149
2150 calculate_live_intervals();
2151
2152 foreach_list_safe(node, &this->instructions) {
2153 fs_inst *inst = (fs_inst *)node;
2154
2155 if (inst->opcode != BRW_OPCODE_MOV ||
2156 inst->is_partial_write() ||
2157 inst->saturate ||
2158 inst->src[0].file != GRF ||
2159 inst->src[0].negate ||
2160 inst->src[0].abs ||
2161 inst->src[0].smear != -1 ||
2162 inst->dst.file != GRF ||
2163 inst->dst.type != inst->src[0].type ||
2164 virtual_grf_sizes[inst->src[0].reg] != 1) {
2165 continue;
2166 }
2167
2168 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2169 int var_to = live_intervals->var_from_reg(&inst->dst);
2170
2171 if (live_intervals->vars_interfere(var_from, var_to))
2172 continue;
2173
2174 int reg_from = inst->src[0].reg;
2175 assert(inst->src[0].reg_offset == 0);
2176 int reg_to = inst->dst.reg;
2177 int reg_to_offset = inst->dst.reg_offset;
2178
2179 foreach_list(node, &this->instructions) {
2180 fs_inst *scan_inst = (fs_inst *)node;
2181
2182 if (scan_inst->dst.file == GRF &&
2183 scan_inst->dst.reg == reg_from) {
2184 scan_inst->dst.reg = reg_to;
2185 scan_inst->dst.reg_offset = reg_to_offset;
2186 }
2187 for (int i = 0; i < 3; i++) {
2188 if (scan_inst->src[i].file == GRF &&
2189 scan_inst->src[i].reg == reg_from) {
2190 scan_inst->src[i].reg = reg_to;
2191 scan_inst->src[i].reg_offset = reg_to_offset;
2192 }
2193 }
2194 }
2195
2196 inst->remove();
2197 progress = true;
2198 continue;
2199 }
2200
2201 if (progress)
2202 invalidate_live_intervals();
2203
2204 return progress;
2205 }
2206
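/**
 * Coalesces away a raw GRF-to-GRF (or UNIFORM-to-GRF) MOV by rewriting
 * later reads of its destination to read the source instead, as long as
 * nothing overwrites either register in between. Illustrative IR:
 *
 *    mov vgrf2, vgrf1
 *    add vgrf3, vgrf2, c   ->   add vgrf3, vgrf1, c   (MOV removed)
 */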
2207 bool
2208 fs_visitor::register_coalesce()
2209 {
2210 bool progress = false;
2211 int if_depth = 0;
2212 int loop_depth = 0;
2213
2214 foreach_list_safe(node, &this->instructions) {
2215 fs_inst *inst = (fs_inst *)node;
2216
2217 /* Make sure that we dominate the instructions we're going to
2218 * scan for interfering with our coalescing, or we won't have
2219 * scanned enough to see if anything interferes with our
2220 * coalescing. We don't dominate the following instructions if
2221 * we're in a loop or an if block.
2222 */
2223 switch (inst->opcode) {
2224 case BRW_OPCODE_DO:
2225 loop_depth++;
2226 break;
2227 case BRW_OPCODE_WHILE:
2228 loop_depth--;
2229 break;
2230 case BRW_OPCODE_IF:
2231 if_depth++;
2232 break;
2233 case BRW_OPCODE_ENDIF:
2234 if_depth--;
2235 break;
2236 default:
2237 break;
2238 }
2239 if (loop_depth || if_depth)
2240 continue;
2241
2242 if (inst->opcode != BRW_OPCODE_MOV ||
2243 inst->is_partial_write() ||
2244 inst->saturate ||
2245 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2246 inst->src[0].file != UNIFORM) ||
2247 inst->dst.type != inst->src[0].type)
2248 continue;
2249
2250 bool has_source_modifiers = (inst->src[0].abs ||
2251 inst->src[0].negate ||
2252 inst->src[0].smear != -1 ||
2253 inst->src[0].file == UNIFORM);
2254
2255 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2256 * them: check for no writes to either one until the exit of the
2257 * program.
2258 */
2259 bool interfered = false;
2260
2261 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2262 !scan_inst->is_tail_sentinel();
2263 scan_inst = (fs_inst *)scan_inst->next) {
2264 if (scan_inst->dst.file == GRF) {
2265 if (scan_inst->overwrites_reg(inst->dst) ||
2266 scan_inst->overwrites_reg(inst->src[0])) {
2267 interfered = true;
2268 break;
2269 }
2270 }
2271
2272 if (has_source_modifiers) {
2273 for (int i = 0; i < 3; i++) {
2274 if (scan_inst->src[i].file == GRF &&
2275 scan_inst->src[i].reg == inst->dst.reg &&
2276 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2277 inst->dst.type != scan_inst->src[i].type)
2278 {
2279 interfered = true;
2280 break;
2281 }
2282 }
2283 }
2284
2285
2286 /* The gen6 MATH instruction can't handle source modifiers or
2287 * unusual register regions, so avoid coalescing those for
2288 * now. We should do something more specific.
2289 */
2290 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2291 interfered = true;
2292 break;
2293 }
2294
2295 if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2296 scan_inst->src[0].file == GRF &&
2297 scan_inst->src[0].reg == inst->dst.reg) {
2298 interfered = true;
2299 break;
2300 }
2301
2302 /* The accumulator result appears to get used for the
2303 * conditional modifier generation. When negating a UD
2304 * value, there is a 33rd bit generated for the sign in the
2305 * accumulator value, so now you can't check, for example,
2306 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2307 */
2308 if (scan_inst->conditional_mod &&
2309 inst->src[0].negate &&
2310 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2311 interfered = true;
2312 break;
2313 }
2314 }
2315 if (interfered) {
2316 continue;
2317 }
2318
2319 /* Rewrite the later usage to point at the source of the move to
2320 * be removed.
2321 */
2322 for (fs_inst *scan_inst = inst;
2323 !scan_inst->is_tail_sentinel();
2324 scan_inst = (fs_inst *)scan_inst->next) {
2325 for (int i = 0; i < 3; i++) {
2326 if (scan_inst->src[i].file == GRF &&
2327 scan_inst->src[i].reg == inst->dst.reg &&
2328 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2329 fs_reg new_src = inst->src[0];
2330 if (scan_inst->src[i].abs) {
2331 new_src.negate = 0;
2332 new_src.abs = 1;
2333 }
2334 new_src.negate ^= scan_inst->src[i].negate;
2335 new_src.sechalf = scan_inst->src[i].sechalf;
2336 scan_inst->src[i] = new_src;
2337 }
2338 }
2339 }
2340
2341 inst->remove();
2342 progress = true;
2343 }
2344
2345 if (progress)
2346 invalidate_live_intervals();
2347
2348 return progress;
2349 }
2350
2351
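/**
 * Folds a MOV from a GRF into an MRF back into the instruction that
 * computed the GRF value, so the computation writes the MRF directly.
 * Illustrative IR:
 *
 *    add vgrf1, a, b        ->   add m3, a, b
 *    mov m3, vgrf1               (MOV removed)
 */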
2352 bool
2353 fs_visitor::compute_to_mrf()
2354 {
2355 bool progress = false;
2356 int next_ip = 0;
2357
2358 calculate_live_intervals();
2359
2360 foreach_list_safe(node, &this->instructions) {
2361 fs_inst *inst = (fs_inst *)node;
2362
2363 int ip = next_ip;
2364 next_ip++;
2365
2366 if (inst->opcode != BRW_OPCODE_MOV ||
2367 inst->is_partial_write() ||
2368 inst->dst.file != MRF || inst->src[0].file != GRF ||
2369 inst->dst.type != inst->src[0].type ||
2370 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2371 continue;
2372
2373 /* Work out which hardware MRF registers are written by this
2374 * instruction.
2375 */
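/* With COMPR4 addressing, a compressed write lands in mrf_low and
 * mrf_low + 4; an ordinary compressed (16-wide) write covers two
 * adjacent MRFs; otherwise only one MRF is written.
 */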
2376 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2377 int mrf_high;
2378 if (inst->dst.reg & BRW_MRF_COMPR4) {
2379 mrf_high = mrf_low + 4;
2380 } else if (dispatch_width == 16 &&
2381 (!inst->force_uncompressed && !inst->force_sechalf)) {
2382 mrf_high = mrf_low + 1;
2383 } else {
2384 mrf_high = mrf_low;
2385 }
2386
2387 /* Can't compute-to-MRF this GRF if someone else was going to
2388 * read it later.
2389 */
2390 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2391 continue;
2392
2393 /* Found a move of a GRF to a MRF. Let's see if we can go
2394 * rewrite the thing that made this GRF to write into the MRF.
2395 */
2396 fs_inst *scan_inst;
2397 for (scan_inst = (fs_inst *)inst->prev;
2398 scan_inst->prev != NULL;
2399 scan_inst = (fs_inst *)scan_inst->prev) {
2400 if (scan_inst->dst.file == GRF &&
2401 scan_inst->dst.reg == inst->src[0].reg) {
2402 /* Found the last thing to write our reg we want to turn
2403 * into a compute-to-MRF.
2404 */
2405
2406 /* If this one instruction didn't populate all the
2407 * channels, bail. We might be able to rewrite everything
2408 * that writes that reg, but it would require smarter
2409 * tracking to delay the rewriting until complete success.
2410 */
2411 if (scan_inst->is_partial_write())
2412 break;
2413
2414 /* Things returning more than one register would need us to
2415 * understand coalescing out more than one MOV at a time.
2416 */
2417 if (scan_inst->regs_written > 1)
2418 break;
2419
2420 /* SEND instructions can't have MRF as a destination. */
2421 if (scan_inst->mlen)
2422 break;
2423
2424 if (brw->gen == 6) {
2425 /* gen6 math instructions must have the destination be
2426 * GRF, so no compute-to-MRF for them.
2427 */
2428 if (scan_inst->is_math()) {
2429 break;
2430 }
2431 }
2432
2433 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2434 /* Found the creator of our MRF's source value. */
2435 scan_inst->dst.file = MRF;
2436 scan_inst->dst.reg = inst->dst.reg;
2437 scan_inst->saturate |= inst->saturate;
2438 inst->remove();
2439 progress = true;
2440 }
2441 break;
2442 }
2443
2444 /* We don't handle control flow here. Most computation of
2445 * values that end up in MRFs is shortly before the MRF
2446 * write anyway.
2447 */
2448 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2449 break;
2450
2451 /* You can't read from an MRF, so if someone else reads our
2452 * MRF's source GRF that we wanted to rewrite, that stops us.
2453 */
2454 bool interfered = false;
2455 for (int i = 0; i < 3; i++) {
2456 if (scan_inst->src[i].file == GRF &&
2457 scan_inst->src[i].reg == inst->src[0].reg &&
2458 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2459 interfered = true;
2460 }
2461 }
2462 if (interfered)
2463 break;
2464
2465 if (scan_inst->dst.file == MRF) {
2466 /* If somebody else writes our MRF here, we can't
2467 * compute-to-MRF before that.
2468 */
2469 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2470 int scan_mrf_high;
2471
2472 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2473 scan_mrf_high = scan_mrf_low + 4;
2474 } else if (dispatch_width == 16 &&
2475 (!scan_inst->force_uncompressed &&
2476 !scan_inst->force_sechalf)) {
2477 scan_mrf_high = scan_mrf_low + 1;
2478 } else {
2479 scan_mrf_high = scan_mrf_low;
2480 }
2481
2482 if (mrf_low == scan_mrf_low ||
2483 mrf_low == scan_mrf_high ||
2484 mrf_high == scan_mrf_low ||
2485 mrf_high == scan_mrf_high) {
2486 break;
2487 }
2488 }
2489
2490 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2491 /* Found a SEND instruction, which means that there are
2492 * live values in MRFs from base_mrf to base_mrf +
2493 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2494 * above it.
2495 */
2496 if (mrf_low >= scan_inst->base_mrf &&
2497 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2498 break;
2499 }
2500 if (mrf_high >= scan_inst->base_mrf &&
2501 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2502 break;
2503 }
2504 }
2505 }
2506 }
2507
2508 if (progress)
2509 invalidate_live_intervals();
2510
2511 return progress;
2512 }
2513
2514 /**
2515 * Walks through basic blocks, looking for repeated MRF writes and
2516 * removing the later ones.
2517 */
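/* Illustrative IR, within one basic block:
 *
 *    mov m2, vgrf1
 *    mov m2, vgrf1   <- removed
 */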
2518 bool
2519 fs_visitor::remove_duplicate_mrf_writes()
2520 {
2521 fs_inst *last_mrf_move[16];
2522 bool progress = false;
2523
2524 /* The MRF tracking below doesn't handle compressed instructions, so skip 16-wide dispatch. */
2525 if (dispatch_width == 16)
2526 return false;
2527
2528 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2529
2530 foreach_list_safe(node, &this->instructions) {
2531 fs_inst *inst = (fs_inst *)node;
2532
2533 if (inst->is_control_flow()) {
2534 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2535 }
2536
2537 if (inst->opcode == BRW_OPCODE_MOV &&
2538 inst->dst.file == MRF) {
2539 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2540 if (prev_inst && inst->equals(prev_inst)) {
2541 inst->remove();
2542 progress = true;
2543 continue;
2544 }
2545 }
2546
2547 /* Clear out the last-write records for MRFs that were overwritten. */
2548 if (inst->dst.file == MRF) {
2549 last_mrf_move[inst->dst.reg] = NULL;
2550 }
2551
2552 if (inst->mlen > 0 && inst->base_mrf != -1) {
2553 /* Found a SEND instruction, which will include two or fewer
2554 * implied MRF writes. We could do better here.
2555 */
2556 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2557 last_mrf_move[inst->base_mrf + i] = NULL;
2558 }
2559 }
2560
2561 /* Clear out any MRF move records whose sources got overwritten. */
2562 if (inst->dst.file == GRF) {
2563 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2564 if (last_mrf_move[i] &&
2565 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2566 last_mrf_move[i] = NULL;
2567 }
2568 }
2569 }
2570
2571 if (inst->opcode == BRW_OPCODE_MOV &&
2572 inst->dst.file == MRF &&
2573 inst->src[0].file == GRF &&
2574 !inst->is_partial_write()) {
2575 last_mrf_move[inst->dst.reg] = inst;
2576 }
2577 }
2578
2579 if (progress)
2580 invalidate_live_intervals();
2581
2582 return progress;
2583 }
2584
2585 static void
2586 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2587 int first_grf, int grf_len)
2588 {
2589 bool inst_16wide = (dispatch_width > 8 &&
2590 !inst->force_uncompressed &&
2591 !inst->force_sechalf);
2592
2593 /* Clear the flag for registers that actually got read (as expected). */
2594 for (int i = 0; i < 3; i++) {
2595 int grf;
2596 if (inst->src[i].file == GRF) {
2597 grf = inst->src[i].reg;
2598 } else if (inst->src[i].file == HW_REG &&
2599 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2600 grf = inst->src[i].fixed_hw_reg.nr;
2601 } else {
2602 continue;
2603 }
2604
2605 if (grf >= first_grf &&
2606 grf < first_grf + grf_len) {
2607 deps[grf - first_grf] = false;
2608 if (inst_16wide)
2609 deps[grf - first_grf + 1] = false;
2610 }
2611 }
2612 }
2613
2614 /**
2615 * Implements this workaround for the original 965:
2616 *
2617 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2618 * check for post destination dependencies on this instruction, software
2619 * must ensure that there is no destination hazard for the case of ‘write
2620 * followed by a posted write’ shown in the following example.
2621 *
2622 * 1. mov r3 0
2623 * 2. send r3.xy <rest of send instruction>
2624 * 3. mov r2 r3
2625 *
2626 * Due to no post-destination dependency check on the ‘send’, the above
2627 * code sequence could have two instructions (1 and 2) in flight at the
2628 * same time that both consider ‘r3’ as the target of their final writes."
2629 */
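/* We resolve the hazard by inserting dependency-resolving MOVs
 * (DEP_RESOLVE_MOV) on the affected registers before our instruction, so
 * any outstanding write must land first.
 */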
2630 void
2631 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2632 {
2633 int reg_size = dispatch_width / 8;
2634 int write_len = inst->regs_written * reg_size;
2635 int first_write_grf = inst->dst.reg;
2636 bool needs_dep[BRW_MAX_MRF];
2637 assert(write_len < (int)sizeof(needs_dep) - 1);
2638
2639 memset(needs_dep, false, sizeof(needs_dep));
2640 memset(needs_dep, true, write_len);
2641
2642 clear_deps_for_inst_src(inst, dispatch_width,
2643 needs_dep, first_write_grf, write_len);
2644
2645 /* Walk backwards looking for writes to registers we're writing which
2646 * aren't read since being written. If we hit the start of the program,
2647 * we assume that there are no outstanding dependencies on entry to the
2648 * program.
2649 */
2650 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2651 scan_inst != NULL;
2652 scan_inst = (fs_inst *)scan_inst->prev) {
2653
2654 /* If we hit control flow, assume that there *are* outstanding
2655 * dependencies, and force their cleanup before our instruction.
2656 */
2657 if (scan_inst->is_control_flow()) {
2658 for (int i = 0; i < write_len; i++) {
2659 if (needs_dep[i]) {
2660 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2661 }
2662 }
2663 return;
2664 }
2665
2666 bool scan_inst_16wide = (dispatch_width > 8 &&
2667 !scan_inst->force_uncompressed &&
2668 !scan_inst->force_sechalf);
2669
2670 /* We insert our reads as late as possible on the assumption that any
2671 * instruction but a MOV that might have left us an outstanding
2672 * dependency has more latency than a MOV.
2673 */
2674 if (scan_inst->dst.file == GRF) {
2675 for (int i = 0; i < scan_inst->regs_written; i++) {
2676 int reg = scan_inst->dst.reg + i * reg_size;
2677
2678 if (reg >= first_write_grf &&
2679 reg < first_write_grf + write_len &&
2680 needs_dep[reg - first_write_grf]) {
2681 inst->insert_before(DEP_RESOLVE_MOV(reg));
2682 needs_dep[reg - first_write_grf] = false;
2683 if (scan_inst_16wide)
2684 needs_dep[reg - first_write_grf + 1] = false;
2685 }
2686 }
2687 }
2688
2689 /* Clear the flag for registers that actually got read (as expected). */
2690 clear_deps_for_inst_src(scan_inst, dispatch_width,
2691 needs_dep, first_write_grf, write_len);
2692
2693 /* Continue the loop only if we haven't resolved all the dependencies */
2694 int i;
2695 for (i = 0; i < write_len; i++) {
2696 if (needs_dep[i])
2697 break;
2698 }
2699 if (i == write_len)
2700 return;
2701 }
2702 }
2703
2704 /**
2705 * Implements this workaround for the original 965:
2706 *
2707 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2708 * used as a destination register until after it has been sourced by an
2709 * instruction with a different destination register."
2710 */
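/* Resolved like the pre-send case: walk forward from the send and insert
 * dependency-resolving MOVs that source its destination registers before
 * anything overwrites them.
 */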
2711 void
2712 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2713 {
2714 int write_len = inst->regs_written * dispatch_width / 8;
2715 int first_write_grf = inst->dst.reg;
2716 bool needs_dep[BRW_MAX_MRF];
2717 assert(write_len < (int)sizeof(needs_dep) - 1);
2718
2719 memset(needs_dep, false, sizeof(needs_dep));
2720 memset(needs_dep, true, write_len);
2721 /* Walk forwards looking for writes to registers we're writing which aren't
2722 * read before being written.
2723 */
2724 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2725 !scan_inst->is_tail_sentinel();
2726 scan_inst = (fs_inst *)scan_inst->next) {
2727 /* If we hit control flow, force resolve all remaining dependencies. */
2728 if (scan_inst->is_control_flow()) {
2729 for (int i = 0; i < write_len; i++) {
2730 if (needs_dep[i])
2731 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2732 }
2733 return;
2734 }
2735
2736 /* Clear the flag for registers that actually got read (as expected). */
2737 clear_deps_for_inst_src(scan_inst, dispatch_width,
2738 needs_dep, first_write_grf, write_len);
2739
2740 /* We insert our reads as late as possible since they're reading the
2741 * result of a SEND, which has massive latency.
2742 */
2743 if (scan_inst->dst.file == GRF &&
2744 scan_inst->dst.reg >= first_write_grf &&
2745 scan_inst->dst.reg < first_write_grf + write_len &&
2746 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2747 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2748 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2749 }
2750
2751 /* Continue the loop only if we haven't resolved all the dependencies */
2752 int i;
2753 for (i = 0; i < write_len; i++) {
2754 if (needs_dep[i])
2755 break;
2756 }
2757 if (i == write_len)
2758 return;
2759 }
2760
2761 /* If we hit the end of the program, resolve all remaining dependencies out
2762 * of paranoia.
2763 */
2764 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2765 assert(last_inst->eot);
2766 for (int i = 0; i < write_len; i++) {
2767 if (needs_dep[i])
2768 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2769 }
2770 }
2771
2772 void
2773 fs_visitor::insert_gen4_send_dependency_workarounds()
2774 {
2775 if (brw->gen != 4 || brw->is_g4x)
2776 return;
2777
2778 /* Note that we're done with register allocation, so GRF fs_regs always
2779 * have a .reg_offset of 0.
2780 */
2781
2782 foreach_list_safe(node, &this->instructions) {
2783 fs_inst *inst = (fs_inst *)node;
2784
2785 if (inst->mlen != 0 && inst->dst.file == GRF) {
2786 insert_gen4_pre_send_dependency_workarounds(inst);
2787 insert_gen4_post_send_dependency_workarounds(inst);
2788 }
2789 }
2790 }
2791
2792 /**
2793 * Turns the generic expression-style uniform pull constant load instruction
2794 * into a hardware-specific series of instructions for loading a pull
2795 * constant.
2796 *
2797 * The expression style allows the CSE pass before this to optimize out
2798 * repeated loads from the same offset, and gives the pre-register-allocation
2799 * scheduling full flexibility, while the conversion to native instructions
2800 * allows the post-register-allocation scheduler the best information
2801 * possible.
2802 *
2803 * Note that execution masking for setting up pull constant loads is special:
2804 * the channels that need to be written are unrelated to the current execution
2805 * mask, since a later instruction will use one of the result channels as a
2806 * source operand for all 8 or 16 of its channels.
2807 */
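/* Roughly (illustrative IR), on gen7:
 *
 *    uniform_pull_const_load dst, surf_index, vec4_byte_offset
 *
 * becomes:
 *
 *    set_simd4x2_offset payload, dword_offset
 *    uniform_pull_const_load_gen7 dst, surf_index, payload
 *
 * while on earlier gens the load is simply pointed at a known-safe MRF.
 */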
2808 void
2809 fs_visitor::lower_uniform_pull_constant_loads()
2810 {
2811 foreach_list(node, &this->instructions) {
2812 fs_inst *inst = (fs_inst *)node;
2813
2814 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2815 continue;
2816
2817 if (brw->gen >= 7) {
2818 /* The offset arg before was a vec4-aligned byte offset. We need to
2819 * turn it into a dword offset.
2820 */
2821 fs_reg const_offset_reg = inst->src[1];
2822 assert(const_offset_reg.file == IMM &&
2823 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2824 const_offset_reg.imm.u /= 4;
2825 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2826
2827 /* This is actually going to be a MOV, but since only the first dword
2828 * is accessed, we have a special opcode to do just that one. Note
2829 * that this needs to be an operation that will be considered a def
2830 * by live variable analysis, or register allocation will explode.
2831 */
2832 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2833 payload, const_offset_reg);
2834 setup->force_writemask_all = true;
2835
2836 setup->ir = inst->ir;
2837 setup->annotation = inst->annotation;
2838 inst->insert_before(setup);
2839
2840 /* Similarly, this will only populate the first 4 channels of the
2841 * result register (since we only use smear values from 0-3), but we
2842 * don't tell the optimizer.
2843 */
2844 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2845 inst->src[1] = payload;
2846
2847 invalidate_live_intervals();
2848 } else {
2849 /* Before register allocation, we didn't tell the scheduler about the
2850 * MRF we use. We know it's safe to use this MRF because nothing
2851 * else does except for register spill/unspill, which generates and
2852 * uses its MRF within a single IR instruction.
2853 */
2854 inst->base_mrf = 14;
2855 inst->mlen = 1;
2856 }
2857 }
2858 }
2859
2860 void
2861 fs_visitor::dump_instruction(backend_instruction *be_inst)
2862 {
2863 fs_inst *inst = (fs_inst *)be_inst;
2864
2865 if (inst->predicate) {
2866 printf("(%cf0.%d) ",
2867 inst->predicate_inverse ? '-' : '+',
2868 inst->flag_subreg);
2869 }
2870
2871 printf("%s", brw_instruction_name(inst->opcode));
2872 if (inst->saturate)
2873 printf(".sat");
2874 if (inst->conditional_mod) {
2875 printf(".cmod");
2876 if (!inst->predicate &&
2877 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2878 inst->opcode != BRW_OPCODE_IF &&
2879 inst->opcode != BRW_OPCODE_WHILE))) {
2880 printf(".f0.%d", inst->flag_subreg);
2881 }
2882 }
2883 printf(" ");
2884
2885
2886 switch (inst->dst.file) {
2887 case GRF:
2888 printf("vgrf%d", inst->dst.reg);
2889 if (inst->dst.reg_offset)
2890 printf("+%d", inst->dst.reg_offset);
2891 break;
2892 case MRF:
2893 printf("m%d", inst->dst.reg);
2894 break;
2895 case BAD_FILE:
2896 printf("(null)");
2897 break;
2898 case UNIFORM:
2899 printf("***u%d***", inst->dst.reg);
2900 break;
2901 case HW_REG:
2902 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2903 if (inst->dst.fixed_hw_reg.subnr)
2904 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2905 break;
2906 default:
2907 printf("???");
2908 break;
2909 }
2910 printf(", ");
2911
2912 for (int i = 0; i < 3; i++) {
2913 if (inst->src[i].negate)
2914 printf("-");
2915 if (inst->src[i].abs)
2916 printf("|");
2917 switch (inst->src[i].file) {
2918 case GRF:
2919 printf("vgrf%d", inst->src[i].reg);
2920 if (inst->src[i].reg_offset)
2921 printf("+%d", inst->src[i].reg_offset);
2922 break;
2923 case MRF:
2924 printf("***m%d***", inst->src[i].reg);
2925 break;
2926 case UNIFORM:
2927 printf("u%d", inst->src[i].reg);
2928 if (inst->src[i].reg_offset)
2929 printf(".%d", inst->src[i].reg_offset);
2930 break;
2931 case BAD_FILE:
2932 printf("(null)");
2933 break;
2934 case IMM:
2935 switch (inst->src[i].type) {
2936 case BRW_REGISTER_TYPE_F:
2937 printf("%ff", inst->src[i].imm.f);
2938 break;
2939 case BRW_REGISTER_TYPE_D:
2940 printf("%dd", inst->src[i].imm.i);
2941 break;
2942 case BRW_REGISTER_TYPE_UD:
2943 printf("%uu", inst->src[i].imm.u);
2944 break;
2945 default:
2946 printf("???");
2947 break;
2948 }
2949 break;
2950 case HW_REG:
2951 if (inst->src[i].fixed_hw_reg.negate)
2952 printf("-");
2953 if (inst->src[i].fixed_hw_reg.abs)
2954 printf("|");
2955 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2956 if (inst->src[i].fixed_hw_reg.subnr)
2957 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2958 if (inst->src[i].fixed_hw_reg.abs)
2959 printf("|");
2960 break;
2961 default:
2962 printf("???");
2963 break;
2964 }
2965 if (inst->src[i].abs)
2966 printf("|");
2967
2968 if (i < 2)
2969 printf(", ");
2970 }
2971
2972 printf(" ");
2973
2974 if (inst->force_uncompressed)
2975 printf("1sthalf ");
2976
2977 if (inst->force_sechalf)
2978 printf("2ndhalf ");
2979
2980 printf("\n");
2981 }
2982
2983 /**
2984 * Possibly returns an instruction that set up @param reg.
2985 *
2986 * Sometimes we want to take the result of some expression/variable
2987 * dereference tree and rewrite the instruction generating the result
2988 * of the tree. When processing the tree, we know that the
2989 * instructions generated are all writing temporaries that are dead
2990 * outside of this tree. So, if we have some instructions that write
2991 * a temporary, we're free to point that temp write somewhere else.
2992 *
2993 * Note that this doesn't guarantee that the returned instruction wrote
2994 * only reg -- it might be the size=4 destination of a texture instruction.
2995 */
2996 fs_inst *
2997 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2998 fs_inst *end,
2999 fs_reg reg)
3000 {
3001 if (end == start ||
3002 end->is_partial_write() ||
3003 reg.reladdr ||
3004 !reg.equals(end->dst)) {
3005 return NULL;
3006 } else {
3007 return end;
3008 }
3009 }
3010
3011 void
3012 fs_visitor::setup_payload_gen6()
3013 {
3014 bool uses_depth =
3015 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3016 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3017
3018 assert(brw->gen >= 6);
3019
3020 /* R0-1: masks, pixel X/Y coordinates. */
3021 c->nr_payload_regs = 2;
3022 /* R2: only for 32-pixel dispatch. */
3023
3024 /* R3-26: barycentric interpolation coordinates. These appear in the
3025 * same order that they appear in the brw_wm_barycentric_interp_mode
3026 * enum. Each set of coordinates occupies 2 registers if dispatch width
3027 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3028 * appear if they were enabled using the "Barycentric Interpolation
3029 * Mode" bits in WM_STATE.
3030 */
3031 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3032 if (barycentric_interp_modes & (1 << i)) {
3033 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3034 c->nr_payload_regs += 2;
3035 if (dispatch_width == 16) {
3036 c->nr_payload_regs += 2;
3037 }
3038 }
3039 }
3040
3041 /* R27: interpolated depth if uses source depth */
3042 if (uses_depth) {
3043 c->source_depth_reg = c->nr_payload_regs;
3044 c->nr_payload_regs++;
3045 if (dispatch_width == 16) {
3046 /* R28: interpolated depth if not 8-wide. */
3047 c->nr_payload_regs++;
3048 }
3049 }
3050 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3051 if (uses_depth) {
3052 c->source_w_reg = c->nr_payload_regs;
3053 c->nr_payload_regs++;
3054 if (dispatch_width == 16) {
3055 /* R30: interpolated W if not 8-wide. */
3056 c->nr_payload_regs++;
3057 }
3058 }
3059 /* R31: MSAA position offsets. */
3060 /* R32-: bary for 32-pixel. */
3061 /* R58-59: interp W for 32-pixel. */
3062
3063 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3064 c->source_depth_to_render_target = true;
3065 }
3066 }
3067
3068 void
3069 fs_visitor::assign_binding_table_offsets()
3070 {
3071 uint32_t next_binding_table_offset = 0;
3072
3073 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3074 next_binding_table_offset += c->key.nr_color_regions;
3075
3076 assign_common_binding_table_offsets(next_binding_table_offset);
3077 }
3078
3079 bool
3080 fs_visitor::run()
3081 {
3082 sanity_param_count = fp->Base.Parameters->NumParameters;
3083 uint32_t orig_nr_params = c->prog_data.nr_params;
3084
3085 assign_binding_table_offsets();
3086
3087 if (brw->gen >= 6)
3088 setup_payload_gen6();
3089 else
3090 setup_payload_gen4();
3091
3092 if (0) {
3093 emit_dummy_fs();
3094 } else {
3095 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3096 emit_shader_time_begin();
3097
3098 calculate_urb_setup();
3099 if (fp->Base.InputsRead > 0) {
3100 if (brw->gen < 6)
3101 emit_interpolation_setup_gen4();
3102 else
3103 emit_interpolation_setup_gen6();
3104 }
3105
3106 /* We handle discards by keeping track of the still-live pixels in f0.1.
3107 * Initialize it with the dispatched pixels.
3108 */
3109 if (fp->UsesKill) {
3110 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3111 discard_init->flag_subreg = 1;
3112 }
3113
3114 /* Generate FS IR for main(). (The visitor only descends into
3115 * functions called "main".)
3116 */
3117 if (shader) {
3118 foreach_list(node, &*shader->ir) {
3119 ir_instruction *ir = (ir_instruction *)node;
3120 base_ir = ir;
3121 this->result = reg_undef;
3122 ir->accept(this);
3123 }
3124 } else {
3125 emit_fragment_program_code();
3126 }
3127 base_ir = NULL;
3128 if (failed)
3129 return false;
3130
3131 emit(FS_OPCODE_PLACEHOLDER_HALT);
3132
3133 emit_fb_writes();
3134
3135 split_virtual_grfs();
3136
3137 move_uniform_array_access_to_pull_constants();
3138 remove_dead_constants();
3139 setup_pull_constants();
3140
3141 bool progress;
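/* Iterate the optimization passes to a fixed point: each pass can expose
 * new opportunities for the others, so loop until none reports progress.
 */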
3142 do {
3143 progress = false;
3144
3145 compact_virtual_grfs();
3146
3147 progress = remove_duplicate_mrf_writes() || progress;
3148
3149 progress = opt_algebraic() || progress;
3150 progress = opt_cse() || progress;
3151 progress = opt_copy_propagate() || progress;
3152 progress = dead_code_eliminate() || progress;
3153 progress = dead_code_eliminate_local() || progress;
3154 progress = register_coalesce() || progress;
3155 progress = register_coalesce_2() || progress;
3156 progress = compute_to_mrf() || progress;
3157 } while (progress);
3158
3159 schedule_instructions(false);
3160
3161 lower_uniform_pull_constant_loads();
3162
3163 assign_curb_setup();
3164 assign_urb_setup();
3165
3166 if (0)
3167 assign_regs_trivial();
3168 else {
3169 while (!assign_regs()) {
3170 if (failed)
3171 break;
3172 }
3173 }
3174 }
3175 assert(force_uncompressed_stack == 0);
3176 assert(force_sechalf_stack == 0);
3177
3178 /* This must come after all optimization and register allocation, since
3179 * it inserts dead code that happens to have side effects, and it does
3180 * so based on the actual physical registers in use.
3181 */
3182 insert_gen4_send_dependency_workarounds();
3183
3184 if (failed)
3185 return false;
3186
3187 schedule_instructions(true);
3188
3189 if (dispatch_width == 8) {
3190 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3191 } else {
3192 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3193
3194 /* Make sure we didn't try to sneak in an extra uniform */
3195 assert(orig_nr_params == c->prog_data.nr_params);
3196 (void) orig_nr_params;
3197 }
3198
3199 /* If any state parameters were appended, then ParameterValues could have
3200 * been realloced, in which case the driver uniform storage set up by
3201 * _mesa_associate_uniform_storage() would point to freed memory. Make
3202 * sure that didn't happen.
3203 */
3204 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3205
3206 return !failed;
3207 }
3208
3209 const unsigned *
3210 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3211 struct gl_fragment_program *fp,
3212 struct gl_shader_program *prog,
3213 unsigned *final_assembly_size)
3214 {
3215 bool start_busy = false;
3216 float start_time = 0;
3217
3218 if (unlikely(brw->perf_debug)) {
3219 start_busy = (brw->batch.last_bo &&
3220 drm_intel_bo_busy(brw->batch.last_bo));
3221 start_time = get_time();
3222 }
3223
3224 struct brw_shader *shader = NULL;
3225 if (prog)
3226 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3227
3228 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3229 if (prog) {
3230 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3231 _mesa_print_ir(shader->ir, NULL);
3232 printf("\n\n");
3233 } else {
3234 printf("ARB_fragment_program %d ir for native fragment shader\n",
3235 fp->Base.Id);
3236 _mesa_print_program(&fp->Base);
3237 }
3238 }
3239
3240 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3241 */
3242 fs_visitor v(brw, c, prog, fp, 8);
3243 if (!v.run()) {
3244 if (prog) {
3245 prog->LinkStatus = false;
3246 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3247 }
3248
3249 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3250 v.fail_msg);
3251
3252 return NULL;
3253 }
3254
3255 exec_list *simd16_instructions = NULL;
3256 fs_visitor v2(brw, c, prog, fp, 16);
3257 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3258 if (c->prog_data.nr_pull_params == 0) {
3259 /* Try a 16-wide compile */
3260 v2.import_uniforms(&v);
3261 if (!v2.run()) {
3262 perf_debug("16-wide shader failed to compile, falling back to "
3263 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3264 } else {
3265 simd16_instructions = &v2.instructions;
3266 }
3267 } else {
3268 perf_debug("Skipping 16-wide due to pull parameters.\n");
3269 }
3270 }
3271
3272 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3273 const unsigned *generated = g.generate_assembly(&v.instructions,
3274 simd16_instructions,
3275 final_assembly_size);
3276
3277 if (unlikely(brw->perf_debug) && shader) {
3278 if (shader->compiled_once)
3279 brw_wm_debug_recompile(brw, prog, &c->key);
3280 shader->compiled_once = true;
3281
3282 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3283 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3284 (get_time() - start_time) * 1000);
3285 }
3286 }
3287
3288 return generated;
3289 }
3290
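/* Precompiles the fragment shader with a guessed (default) program key so
 * a likely variant is built at link time; the per-state compile at draw
 * time still happens if the real key differs. The previous program state
 * is saved and restored around the compile.
 */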
3291 bool
3292 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3293 {
3294 struct brw_context *brw = brw_context(ctx);
3295 struct brw_wm_prog_key key;
3296
3297 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3298 return true;
3299
3300 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3301 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3302 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3303 bool program_uses_dfdy = fp->UsesDFdy;
3304
3305 memset(&key, 0, sizeof(key));
3306
3307 if (brw->gen < 6) {
3308 if (fp->UsesKill)
3309 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3310
3311 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3312 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3313
3314 /* Just assume depth testing. */
3315 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3316 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3317 }
3318
3319 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3320 BRW_FS_VARYING_INPUT_MASK) > 16)
3321 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3322
3323 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3324
3325 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3326 for (unsigned i = 0; i < sampler_count; i++) {
3327 if (fp->Base.ShadowSamplers & (1 << i)) {
3328 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3329 key.tex.swizzles[i] =
3330 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3331 } else {
3332 /* Color sampler: assume no swizzling. */
3333 key.tex.swizzles[i] = SWIZZLE_XYZW;
3334 }
3335 }
3336
3337 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3338 key.drawable_height = ctx->DrawBuffer->Height;
3339 }
3340
3341 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3342 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3343 }
3344
3345 key.nr_color_regions = 1;
3346
3347 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3348 * quality of the derivatives is likely to be determined by the driconf
3349 * option.
3350 */
3351 key.high_quality_derivatives = brw->disable_derivative_optimization;
3352
3353 key.program_string_id = bfp->id;
3354
3355 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3356 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3357
3358 bool success = do_wm_prog(brw, prog, bfp, &key);
3359
3360 brw->wm.base.prog_offset = old_prog_offset;
3361 brw->wm.prog_data = old_prog_data;
3362
3363 return success;
3364 }