i965: Don't print extra (null) arguments in dump_instruction().
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
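/* Each ALUn(op) use below defines a small emit helper on fs_visitor; for
 * example, ALU2(ADD) expands to roughly:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */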
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187
188 /** Gen4 predicated IF. */
189 fs_inst *
190 fs_visitor::IF(uint32_t predicate)
191 {
192 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
193 inst->predicate = predicate;
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 fs_inst *
199 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
200 {
201 assert(brw->gen == 6);
202 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
203 reg_null_d, src0, src1);
204 inst->conditional_mod = condition;
205 return inst;
206 }
207
208 /**
209 * CMP: Sets the low bit of the destination channels with the result
210 * of the comparison, while the upper bits are undefined, and updates
211 * the flag register with the packed 16 bits of the result.
212 */
213 fs_inst *
214 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
215 {
216 fs_inst *inst;
217
218 /* Take the instruction:
219 *
220 * CMP null<d> src0<f> src1<f>
221 *
222 * Original gen4 does type conversion to the destination type before
223 * comparison, producing garbage results for floating point comparisons.
224 * gen5 does the comparison on the execution type (resolved source types),
225 * so dst type doesn't matter. gen6 does comparison and then uses the
226 * result as if it was the dst type with no conversion, which happens to
227 * mostly work out for float-interpreted-as-int since our comparisons are
228 * for >0, =0, <0.
229 */
230 if (brw->gen == 4) {
231 dst.type = src0.type;
232 if (dst.file == HW_REG)
233 dst.fixed_hw_reg.type = dst.type;
234 }
235
236 resolve_ud_negate(&src0);
237 resolve_ud_negate(&src1);
238
239 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
240 inst->conditional_mod = condition;
241
242 return inst;
243 }
244
245 exec_list
246 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
247 fs_reg varying_offset,
248 uint32_t const_offset)
249 {
250 exec_list instructions;
251 fs_inst *inst;
252
253 /* We have our constant surface use a pitch of 4 bytes, so our index can
254 * be any component of a vector, and then we load 4 contiguous
255 * components starting from that.
256 *
257 * We break down the const_offset to a portion added to the variable
258 * offset and a portion done using reg_offset, which means that if you
259 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
260 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
261 * CSE can later notice that those loads are all the same and eliminate
262 * the redundant ones.
263 */
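   /* For example, const_offset == 6 makes the load below fetch from
    * vec4_offset = varying_offset + 4, and the final MOV then reads
    * component (6 & 3) == 2 of the returned vec4 (times `scale` on the
    * gen4 SIMD16-message path).
    */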
264 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
265 instructions.push_tail(ADD(vec4_offset,
266 varying_offset, const_offset & ~3));
267
268 int scale = 1;
269 if (brw->gen == 4 && dispatch_width == 8) {
270 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
271 * u, v, r) as parameters, or we can just use the SIMD16 message
272 * consisting of (header, u). We choose the second, at the cost of a
273 * longer return length.
274 */
275 scale = 2;
276 }
277
278 enum opcode op;
279 if (brw->gen >= 7)
280 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
281 else
282 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
283 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
284 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
285 inst->regs_written = 4 * scale;
286 instructions.push_tail(inst);
287
288 if (brw->gen < 7) {
289 inst->base_mrf = 13;
290 inst->header_present = true;
291 if (brw->gen == 4)
292 inst->mlen = 3;
293 else
294 inst->mlen = 1 + dispatch_width / 8;
295 }
296
297 vec4_result.reg_offset += (const_offset & 3) * scale;
298 instructions.push_tail(MOV(dst, vec4_result));
299
300 return instructions;
301 }
302
303 /**
304 * A helper for MOV generation for fixing up broken hardware SEND dependency
305 * handling.
306 */
307 fs_inst *
308 fs_visitor::DEP_RESOLVE_MOV(int grf)
309 {
310 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
311
312 inst->ir = NULL;
313 inst->annotation = "send dependency resolve";
314
315 /* The caller always wants uncompressed to emit the minimal extra
316 * dependencies, and to avoid having to deal with aligning its regs to 2.
317 */
318 inst->force_uncompressed = true;
319
320 return inst;
321 }
322
323 bool
324 fs_inst::equals(fs_inst *inst)
325 {
326 return (opcode == inst->opcode &&
327 dst.equals(inst->dst) &&
328 src[0].equals(inst->src[0]) &&
329 src[1].equals(inst->src[1]) &&
330 src[2].equals(inst->src[2]) &&
331 saturate == inst->saturate &&
332 predicate == inst->predicate &&
333 conditional_mod == inst->conditional_mod &&
334 mlen == inst->mlen &&
335 base_mrf == inst->base_mrf &&
336 sampler == inst->sampler &&
337 target == inst->target &&
338 eot == inst->eot &&
339 header_present == inst->header_present &&
340 shadow_compare == inst->shadow_compare &&
341 offset == inst->offset);
342 }
343
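/**
 * Returns true if this instruction's destination covers the given register,
 * i.e. the register falls within [dst.reg_offset, dst.reg_offset +
 * regs_written) of the same file and register number.
 */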
344 bool
345 fs_inst::overwrites_reg(const fs_reg &reg)
346 {
347 return (reg.file == dst.file &&
348 reg.reg == dst.reg &&
349 reg.reg_offset >= dst.reg_offset &&
350 reg.reg_offset < dst.reg_offset + regs_written);
351 }
352
353 bool
354 fs_inst::is_send_from_grf()
355 {
356 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
357 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
358 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
359 src[1].file == GRF) ||
360 (is_tex() && src[0].file == GRF));
361 }
362
363 bool
364 fs_visitor::can_do_source_mods(fs_inst *inst)
365 {
366 if (brw->gen == 6 && inst->is_math())
367 return false;
368
369 if (inst->is_send_from_grf())
370 return false;
371
372 if (!inst->can_do_source_mods())
373 return false;
374
375 return true;
376 }
377
378 void
379 fs_reg::init()
380 {
381 memset(this, 0, sizeof(*this));
382 this->smear = -1;
383 }
384
385 /** Generic unset register constructor. */
386 fs_reg::fs_reg()
387 {
388 init();
389 this->file = BAD_FILE;
390 }
391
392 /** Immediate value constructor. */
393 fs_reg::fs_reg(float f)
394 {
395 init();
396 this->file = IMM;
397 this->type = BRW_REGISTER_TYPE_F;
398 this->imm.f = f;
399 }
400
401 /** Immediate value constructor. */
402 fs_reg::fs_reg(int32_t i)
403 {
404 init();
405 this->file = IMM;
406 this->type = BRW_REGISTER_TYPE_D;
407 this->imm.i = i;
408 }
409
410 /** Immediate value constructor. */
411 fs_reg::fs_reg(uint32_t u)
412 {
413 init();
414 this->file = IMM;
415 this->type = BRW_REGISTER_TYPE_UD;
416 this->imm.u = u;
417 }
418
419 /** Fixed brw_reg. */
420 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
421 {
422 init();
423 this->file = HW_REG;
424 this->fixed_hw_reg = fixed_hw_reg;
425 this->type = fixed_hw_reg.type;
426 }
427
428 bool
429 fs_reg::equals(const fs_reg &r) const
430 {
431 return (file == r.file &&
432 reg == r.reg &&
433 reg_offset == r.reg_offset &&
434 type == r.type &&
435 negate == r.negate &&
436 abs == r.abs &&
437 !reladdr && !r.reladdr &&
438 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
439 sizeof(fixed_hw_reg)) == 0 &&
440 smear == r.smear &&
441 imm.u == r.imm.u);
442 }
443
444 fs_reg
445 fs_reg::retype(uint32_t type)
446 {
447 fs_reg result = *this;
448 result.type = type;
449 return result;
450 }
451
452 bool
453 fs_reg::is_zero() const
454 {
455 if (file != IMM)
456 return false;
457
458 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
459 }
460
461 bool
462 fs_reg::is_one() const
463 {
464 if (file != IMM)
465 return false;
466
467 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
468 }
469
470 bool
471 fs_reg::is_null() const
472 {
473 return file == HW_REG &&
474 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
475 fixed_hw_reg.nr == BRW_ARF_NULL;
476 }
477
478 bool
479 fs_reg::is_valid_3src() const
480 {
481 return file == GRF || file == UNIFORM;
482 }
483
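/**
 * Returns the size of a GLSL type in scalar components: e.g. a float is 1,
 * a vec4 is 4, a mat4 is 16 and a vec4[3] array is 12. Samplers and atomic
 * counters take no register space.
 */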
484 int
485 fs_visitor::type_size(const struct glsl_type *type)
486 {
487 unsigned int size, i;
488
489 switch (type->base_type) {
490 case GLSL_TYPE_UINT:
491 case GLSL_TYPE_INT:
492 case GLSL_TYPE_FLOAT:
493 case GLSL_TYPE_BOOL:
494 return type->components();
495 case GLSL_TYPE_ARRAY:
496 return type_size(type->fields.array) * type->length;
497 case GLSL_TYPE_STRUCT:
498 size = 0;
499 for (i = 0; i < type->length; i++) {
500 size += type_size(type->fields.structure[i].type);
501 }
502 return size;
503 case GLSL_TYPE_SAMPLER:
504 /* Samplers take up no register space, since they're baked in at
505 * link time.
506 */
507 return 0;
508 case GLSL_TYPE_ATOMIC_UINT:
509 return 0;
510 case GLSL_TYPE_VOID:
511 case GLSL_TYPE_ERROR:
512 case GLSL_TYPE_INTERFACE:
513 assert(!"not reached");
514 break;
515 }
516
517 return 0;
518 }
519
520 fs_reg
521 fs_visitor::get_timestamp()
522 {
523 assert(brw->gen >= 7);
524
525 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
526 BRW_ARF_TIMESTAMP,
527 0),
528 BRW_REGISTER_TYPE_UD));
529
530 fs_reg dst = fs_reg(this, glsl_type::uint_type);
531
532 fs_inst *mov = emit(MOV(dst, ts));
533 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
534 * even if it's not enabled in the dispatch.
535 */
536 mov->force_writemask_all = true;
537 mov->force_uncompressed = true;
538
539 /* The caller wants the low 32 bits of the timestamp. Since it's running
 540     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
541 * which is plenty of time for our purposes. It is identical across the
542 * EUs, but since it's tracking GPU core speed it will increment at a
543 * varying rate as render P-states change.
544 *
545 * The caller could also check if render P-states have changed (or anything
546 * else that might disrupt timing) by setting smear to 2 and checking if
547 * that field is != 0.
548 */
549 dst.smear = 0;
550
551 return dst;
552 }
553
554 void
555 fs_visitor::emit_shader_time_begin()
556 {
557 current_annotation = "shader time start";
558 shader_start_time = get_timestamp();
559 }
560
561 void
562 fs_visitor::emit_shader_time_end()
563 {
564 current_annotation = "shader time end";
565
566 enum shader_time_shader_type type, written_type, reset_type;
567 if (dispatch_width == 8) {
568 type = ST_FS8;
569 written_type = ST_FS8_WRITTEN;
570 reset_type = ST_FS8_RESET;
571 } else {
572 assert(dispatch_width == 16);
573 type = ST_FS16;
574 written_type = ST_FS16_WRITTEN;
575 reset_type = ST_FS16_RESET;
576 }
577
578 fs_reg shader_end_time = get_timestamp();
579
580 /* Check that there weren't any timestamp reset events (assuming these
581 * were the only two timestamp reads that happened).
582 */
583 fs_reg reset = shader_end_time;
584 reset.smear = 2;
585 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
586 test->conditional_mod = BRW_CONDITIONAL_Z;
587 emit(IF(BRW_PREDICATE_NORMAL));
588
589 push_force_uncompressed();
590 fs_reg start = shader_start_time;
591 start.negate = true;
592 fs_reg diff = fs_reg(this, glsl_type::uint_type);
593 emit(ADD(diff, start, shader_end_time));
594
595 /* If there were no instructions between the two timestamp gets, the diff
596 * is 2 cycles. Remove that overhead, so I can forget about that when
597 * trying to determine the time taken for single instructions.
598 */
599 emit(ADD(diff, diff, fs_reg(-2u)));
600
601 emit_shader_time_write(type, diff);
602 emit_shader_time_write(written_type, fs_reg(1u));
603 emit(BRW_OPCODE_ELSE);
604 emit_shader_time_write(reset_type, fs_reg(1u));
605 emit(BRW_OPCODE_ENDIF);
606
607 pop_force_uncompressed();
608 }
609
610 void
611 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
612 fs_reg value)
613 {
614 int shader_time_index =
615 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
616 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
617
618 fs_reg payload;
619 if (dispatch_width == 8)
620 payload = fs_reg(this, glsl_type::uvec2_type);
621 else
622 payload = fs_reg(this, glsl_type::uint_type);
623
624 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
625 fs_reg(), payload, offset, value));
626 }
627
628 void
629 fs_visitor::fail(const char *format, ...)
630 {
631 va_list va;
632 char *msg;
633
634 if (failed)
635 return;
636
637 failed = true;
638
639 va_start(va, format);
640 msg = ralloc_vasprintf(mem_ctx, format, va);
641 va_end(va);
642 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
643
644 this->fail_msg = msg;
645
646 if (INTEL_DEBUG & DEBUG_WM) {
647 fprintf(stderr, "%s", msg);
648 }
649 }
650
651 fs_inst *
652 fs_visitor::emit(enum opcode opcode)
653 {
654 return emit(fs_inst(opcode));
655 }
656
657 fs_inst *
658 fs_visitor::emit(enum opcode opcode, fs_reg dst)
659 {
660 return emit(fs_inst(opcode, dst));
661 }
662
663 fs_inst *
664 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
665 {
666 return emit(fs_inst(opcode, dst, src0));
667 }
668
669 fs_inst *
670 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
671 {
672 return emit(fs_inst(opcode, dst, src0, src1));
673 }
674
675 fs_inst *
676 fs_visitor::emit(enum opcode opcode, fs_reg dst,
677 fs_reg src0, fs_reg src1, fs_reg src2)
678 {
679 return emit(fs_inst(opcode, dst, src0, src1, src2));
680 }
681
682 void
683 fs_visitor::push_force_uncompressed()
684 {
685 force_uncompressed_stack++;
686 }
687
688 void
689 fs_visitor::pop_force_uncompressed()
690 {
691 force_uncompressed_stack--;
692 assert(force_uncompressed_stack >= 0);
693 }
694
695 /**
696 * Returns true if the instruction has a flag that means it won't
697 * update an entire destination register.
698 *
699 * For example, dead code elimination and live variable analysis want to know
700 * when a write to a variable screens off any preceding values that were in
701 * it.
702 */
703 bool
704 fs_inst::is_partial_write()
705 {
706 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
707 this->force_uncompressed ||
708 this->force_sechalf);
709 }
710
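/**
 * Returns how many virtual GRFs source arg of this instruction reads.
 * Texture operations with a GRF-sourced payload read mlen registers
 * ((mlen + 1) / 2 in SIMD16, where virtual GRFs are two registers wide);
 * all other sources read a single register.
 */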
711 int
712 fs_inst::regs_read(fs_visitor *v, int arg)
713 {
714 if (is_tex() && arg == 0 && src[0].file == GRF) {
715 if (v->dispatch_width == 16)
716 return (mlen + 1) / 2;
717 else
718 return mlen;
719 }
720 return 1;
721 }
722
723 bool
724 fs_inst::reads_flag()
725 {
726 return predicate;
727 }
728
729 bool
730 fs_inst::writes_flag()
731 {
732 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
733 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
734 }
735
736 /**
737 * Returns how many MRFs an FS opcode will write over.
738 *
739 * Note that this is not the 0 or 1 implied writes in an actual gen
740 * instruction -- the FS opcodes often generate MOVs in addition.
741 */
742 int
743 fs_visitor::implied_mrf_writes(fs_inst *inst)
744 {
745 if (inst->mlen == 0)
746 return 0;
747
748 if (inst->base_mrf == -1)
749 return 0;
750
751 switch (inst->opcode) {
752 case SHADER_OPCODE_RCP:
753 case SHADER_OPCODE_RSQ:
754 case SHADER_OPCODE_SQRT:
755 case SHADER_OPCODE_EXP2:
756 case SHADER_OPCODE_LOG2:
757 case SHADER_OPCODE_SIN:
758 case SHADER_OPCODE_COS:
759 return 1 * dispatch_width / 8;
760 case SHADER_OPCODE_POW:
761 case SHADER_OPCODE_INT_QUOTIENT:
762 case SHADER_OPCODE_INT_REMAINDER:
763 return 2 * dispatch_width / 8;
764 case SHADER_OPCODE_TEX:
765 case FS_OPCODE_TXB:
766 case SHADER_OPCODE_TXD:
767 case SHADER_OPCODE_TXF:
768 case SHADER_OPCODE_TXF_MS:
769 case SHADER_OPCODE_TG4:
770 case SHADER_OPCODE_TG4_OFFSET:
771 case SHADER_OPCODE_TXL:
772 case SHADER_OPCODE_TXS:
773 case SHADER_OPCODE_LOD:
774 return 1;
775 case FS_OPCODE_FB_WRITE:
776 return 2;
777 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
778 case SHADER_OPCODE_GEN4_SCRATCH_READ:
779 return 1;
780 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
781 return inst->mlen;
782 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
783 return 2;
784 case SHADER_OPCODE_UNTYPED_ATOMIC:
785 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
786 return 0;
787 default:
788 assert(!"not reached");
789 return inst->mlen;
790 }
791 }
792
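/**
 * Allocates a new virtual GRF of the given size (in registers) and returns
 * its index, growing the virtual_grf_sizes[] array as needed.
 */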
793 int
794 fs_visitor::virtual_grf_alloc(int size)
795 {
796 if (virtual_grf_array_size <= virtual_grf_count) {
797 if (virtual_grf_array_size == 0)
798 virtual_grf_array_size = 16;
799 else
800 virtual_grf_array_size *= 2;
801 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
802 virtual_grf_array_size);
803 }
804 virtual_grf_sizes[virtual_grf_count] = size;
805 return virtual_grf_count++;
806 }
807
 808 /** Register file/number constructor (type defaults to float). */
809 fs_reg::fs_reg(enum register_file file, int reg)
810 {
811 init();
812 this->file = file;
813 this->reg = reg;
814 this->type = BRW_REGISTER_TYPE_F;
815 }
816
 817 /** Register file/number constructor with explicit register type. */
818 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
819 {
820 init();
821 this->file = file;
822 this->reg = reg;
823 this->type = type;
824 }
825
826 /** Automatic reg constructor. */
827 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
828 {
829 init();
830
831 this->file = GRF;
832 this->reg = v->virtual_grf_alloc(v->type_size(type));
833 this->reg_offset = 0;
834 this->type = brw_type_for_base_type(type);
835 }
836
837 fs_reg *
838 fs_visitor::variable_storage(ir_variable *var)
839 {
840 return (fs_reg *)hash_table_find(this->variable_ht, var);
841 }
842
843 void
844 import_uniforms_callback(const void *key,
845 void *data,
846 void *closure)
847 {
848 struct hash_table *dst_ht = (struct hash_table *)closure;
849 const fs_reg *reg = (const fs_reg *)data;
850
851 if (reg->file != UNIFORM)
852 return;
853
854 hash_table_insert(dst_ht, data, key);
855 }
856
 857 /* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
 858  * This brings in those uniform definitions.
859 */
860 void
861 fs_visitor::import_uniforms(fs_visitor *v)
862 {
863 hash_table_call_foreach(v->variable_ht,
864 import_uniforms_callback,
865 variable_ht);
866 this->params_remap = v->params_remap;
867 this->nr_params_remap = v->nr_params_remap;
868 }
869
870 /* Our support for uniforms is piggy-backed on the struct
871 * gl_fragment_program, because that's where the values actually
872 * get stored, rather than in some global gl_shader_program uniform
873 * store.
874 */
875 void
876 fs_visitor::setup_uniform_values(ir_variable *ir)
877 {
878 int namelen = strlen(ir->name);
879
880 /* The data for our (non-builtin) uniforms is stored in a series of
881 * gl_uniform_driver_storage structs for each subcomponent that
882 * glGetUniformLocation() could name. We know it's been set up in the same
883 * order we'd walk the type, so walk the list of storage and find anything
884 * with our name, or the prefix of a component that starts with our name.
885 */
886 unsigned params_before = c->prog_data.nr_params;
887 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
888 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
889
890 if (strncmp(ir->name, storage->name, namelen) != 0 ||
891 (storage->name[namelen] != 0 &&
892 storage->name[namelen] != '.' &&
893 storage->name[namelen] != '[')) {
894 continue;
895 }
896
897 unsigned slots = storage->type->component_slots();
898 if (storage->array_elements)
899 slots *= storage->array_elements;
900
901 for (unsigned i = 0; i < slots; i++) {
902 c->prog_data.param[c->prog_data.nr_params++] =
903 &storage->storage[i].f;
904 }
905 }
906
907 /* Make sure we actually initialized the right amount of stuff here. */
908 assert(params_before + ir->type->component_slots() ==
909 c->prog_data.nr_params);
910 (void)params_before;
911 }
912
913
914 /* Our support for builtin uniforms is even scarier than non-builtin.
915 * It sits on top of the PROG_STATE_VAR parameters that are
916 * automatically updated from GL context state.
917 */
918 void
919 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
920 {
921 const ir_state_slot *const slots = ir->state_slots;
922 assert(ir->state_slots != NULL);
923
924 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
925 /* This state reference has already been setup by ir_to_mesa, but we'll
926 * get the same index back here.
927 */
928 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
929 (gl_state_index *)slots[i].tokens);
930
931 /* Add each of the unique swizzles of the element as a parameter.
932 * This'll end up matching the expected layout of the
933 * array/matrix/structure we're trying to fill in.
934 */
935 int last_swiz = -1;
936 for (unsigned int j = 0; j < 4; j++) {
937 int swiz = GET_SWZ(slots[i].swizzle, j);
938 if (swiz == last_swiz)
939 break;
940 last_swiz = swiz;
941
942 c->prog_data.param[c->prog_data.nr_params++] =
943 &fp->Base.Parameters->ParameterValues[index][swiz].f;
944 }
945 }
946 }
947
948 fs_reg *
949 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
950 {
951 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
952 fs_reg wpos = *reg;
953 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
954
955 /* gl_FragCoord.x */
956 if (ir->pixel_center_integer) {
957 emit(MOV(wpos, this->pixel_x));
958 } else {
959 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
960 }
961 wpos.reg_offset++;
962
963 /* gl_FragCoord.y */
964 if (!flip && ir->pixel_center_integer) {
965 emit(MOV(wpos, this->pixel_y));
966 } else {
967 fs_reg pixel_y = this->pixel_y;
968 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
969
970 if (flip) {
971 pixel_y.negate = true;
972 offset += c->key.drawable_height - 1.0;
973 }
974
975 emit(ADD(wpos, pixel_y, fs_reg(offset)));
976 }
977 wpos.reg_offset++;
978
979 /* gl_FragCoord.z */
980 if (brw->gen >= 6) {
981 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
982 } else {
983 emit(FS_OPCODE_LINTERP, wpos,
984 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
985 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
986 interp_reg(VARYING_SLOT_POS, 2));
987 }
988 wpos.reg_offset++;
989
990 /* gl_FragCoord.w: Already set up in emit_interpolation */
991 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
992
993 return reg;
994 }
995
996 fs_inst *
997 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
998 glsl_interp_qualifier interpolation_mode,
999 bool is_centroid)
1000 {
1001 brw_wm_barycentric_interp_mode barycoord_mode;
1002 if (brw->gen >= 6) {
1003 if (is_centroid) {
1004 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1005 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1006 else
1007 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1008 } else {
1009 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1010 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1011 else
1012 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1013 }
1014 } else {
1015 /* On Ironlake and below, there is only one interpolation mode.
1016 * Centroid interpolation doesn't mean anything on this hardware --
1017 * there is no multisampling.
1018 */
1019 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1020 }
1021 return emit(FS_OPCODE_LINTERP, attr,
1022 this->delta_x[barycoord_mode],
1023 this->delta_y[barycoord_mode], interp);
1024 }
1025
1026 fs_reg *
1027 fs_visitor::emit_general_interpolation(ir_variable *ir)
1028 {
1029 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1030 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1031 fs_reg attr = *reg;
1032
1033 unsigned int array_elements;
1034 const glsl_type *type;
1035
1036 if (ir->type->is_array()) {
1037 array_elements = ir->type->length;
1038 if (array_elements == 0) {
1039 fail("dereferenced array '%s' has length 0\n", ir->name);
1040 }
1041 type = ir->type->fields.array;
1042 } else {
1043 array_elements = 1;
1044 type = ir->type;
1045 }
1046
1047 glsl_interp_qualifier interpolation_mode =
1048 ir->determine_interpolation_mode(c->key.flat_shade);
1049
1050 int location = ir->location;
1051 for (unsigned int i = 0; i < array_elements; i++) {
1052 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1053 if (c->prog_data.urb_setup[location] == -1) {
1054 /* If there's no incoming setup data for this slot, don't
1055 * emit interpolation for it.
1056 */
1057 attr.reg_offset += type->vector_elements;
1058 location++;
1059 continue;
1060 }
1061
1062 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1063 /* Constant interpolation (flat shading) case. The SF has
1064 * handed us defined values in only the constant offset
1065 * field of the setup reg.
1066 */
1067 for (unsigned int k = 0; k < type->vector_elements; k++) {
1068 struct brw_reg interp = interp_reg(location, k);
1069 interp = suboffset(interp, 3);
1070 interp.type = reg->type;
1071 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1072 attr.reg_offset++;
1073 }
1074 } else {
1075 /* Smooth/noperspective interpolation case. */
1076 for (unsigned int k = 0; k < type->vector_elements; k++) {
1077 /* FINISHME: At some point we probably want to push
1078 * this farther by giving similar treatment to the
1079 * other potentially constant components of the
1080 * attribute, as well as making brw_vs_constval.c
1081 * handle varyings other than gl_TexCoord.
1082 */
1083 struct brw_reg interp = interp_reg(location, k);
1084 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1085 ir->centroid);
1086 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1087 /* Get the pixel/sample mask into f0 so that we know
1088 * which pixels are lit. Then, for each channel that is
1089 * unlit, replace the centroid data with non-centroid
1090 * data.
1091 */
1092 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1093 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1094 interpolation_mode, false);
1095 inst->predicate = BRW_PREDICATE_NORMAL;
1096 inst->predicate_inverse = true;
1097 }
1098 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1099 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1100 }
1101 attr.reg_offset++;
1102 }
1103
1104 }
1105 location++;
1106 }
1107 }
1108
1109 return reg;
1110 }
1111
1112 fs_reg *
1113 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1114 {
1115 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1116
1117 /* The frontfacing comes in as a bit in the thread payload. */
1118 if (brw->gen >= 6) {
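      /* The three instructions below shift bit 15 of g0.0 down to bit 0,
       * invert it and mask it to a single 0/1 bit, which becomes the
       * gl_FrontFacing value.
       */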
1119 emit(BRW_OPCODE_ASR, *reg,
1120 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1121 fs_reg(15));
1122 emit(BRW_OPCODE_NOT, *reg, *reg);
1123 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1124 } else {
1125 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 1126       /* Bit 31 is "primitive is back face", so checking < (1 << 31) gives
 1127        * us the front face.
1128 */
1129 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1130 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1131 }
1132
1133 return reg;
1134 }
1135
1136 void
1137 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1138 {
1139 assert(dst.type == BRW_REGISTER_TYPE_F);
1140
1141 if (c->key.compute_pos_offset) {
1142 /* Convert int_sample_pos to floating point */
1143 emit(MOV(dst, int_sample_pos));
1144 /* Scale to the range [0, 1] */
1145 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1146 }
1147 else {
1148 /* From ARB_sample_shading specification:
1149 * "When rendering to a non-multisample buffer, or if multisample
1150 * rasterization is disabled, gl_SamplePosition will always be
 1151        *  (0.5, 0.5)."
1152 */
1153 emit(MOV(dst, fs_reg(0.5f)));
1154 }
1155 }
1156
1157 fs_reg *
1158 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1159 {
1160 assert(brw->gen >= 6);
1161 assert(ir->type == glsl_type::vec2_type);
1162
1163 this->current_annotation = "compute sample position";
1164 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1165 fs_reg pos = *reg;
1166 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1167 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1168
1169 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1170 * mode will be enabled.
1171 *
1172 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1173 * R31.1:0 Position Offset X/Y for Slot[3:0]
1174 * R31.3:2 Position Offset X/Y for Slot[7:4]
1175 * .....
1176 *
1177 * The X, Y sample positions come in as bytes in thread payload. So, read
1178 * the positions using vstride=16, width=8, hstride=2.
1179 */
1180 struct brw_reg sample_pos_reg =
1181 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1182 BRW_REGISTER_TYPE_B), 16, 8, 2);
1183
1184 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1185 if (dispatch_width == 16) {
1186 int_sample_x.sechalf = true;
1187 fs_inst *inst = emit(MOV(int_sample_x,
1188 fs_reg(suboffset(sample_pos_reg, 16))));
1189 inst->force_sechalf = true;
1190 int_sample_x.sechalf = false;
1191 }
1192 /* Compute gl_SamplePosition.x */
1193 compute_sample_position(pos, int_sample_x);
1194 pos.reg_offset++;
1195 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1196 if (dispatch_width == 16) {
1197 int_sample_y.sechalf = true;
1198 fs_inst *inst = emit(MOV(int_sample_y,
1199 fs_reg(suboffset(sample_pos_reg, 17))));
1200 inst->force_sechalf = true;
1201 int_sample_y.sechalf = false;
1202 }
1203 /* Compute gl_SamplePosition.y */
1204 compute_sample_position(pos, int_sample_y);
1205 return reg;
1206 }
1207
1208 fs_reg *
1209 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1210 {
1211 assert(brw->gen >= 6);
1212
1213 this->current_annotation = "compute sample id";
1214 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1215
1216 if (c->key.compute_sample_id) {
1217 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1218 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1219 t2.type = BRW_REGISTER_TYPE_UW;
1220
1221 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1222 * 8x multisampling, subspan 0 will represent sample N (where N
1223 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1224 * 7. We can find the value of N by looking at R0.0 bits 7:6
1225 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1226 * (since samples are always delivered in pairs). That is, we
1227 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1228 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1229 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1230 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1231 * populating a temporary variable with the sequence (0, 1, 2, 3),
1232 * and then reading from it using vstride=1, width=4, hstride=0.
1233 * These computations hold good for 4x multisampling as well.
1234 */
1235 emit(BRW_OPCODE_AND, t1,
1236 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1237 fs_reg(brw_imm_d(0xc0)));
1238 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1239 /* This works for both SIMD8 and SIMD16 */
1240 emit(MOV(t2, brw_imm_v(0x3210)));
1241 /* This special instruction takes care of setting vstride=1,
1242 * width=4, hstride=0 of t2 during an ADD instruction.
1243 */
1244 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1245 } else {
1246 /* As per GL_ARB_sample_shading specification:
1247 * "When rendering to a non-multisample buffer, or if multisample
1248 * rasterization is disabled, gl_SampleID will always be zero."
1249 */
1250 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1251 }
1252
1253 return reg;
1254 }
1255
1256 fs_reg
1257 fs_visitor::fix_math_operand(fs_reg src)
1258 {
1259 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1260 * might be able to do better by doing execsize = 1 math and then
1261 * expanding that result out, but we would need to be careful with
1262 * masking.
1263 *
1264 * The hardware ignores source modifiers (negate and abs) on math
1265 * instructions, so we also move to a temp to set those up.
1266 */
1267 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1268 !src.abs && !src.negate)
1269 return src;
1270
1271 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1272 * operands to math
1273 */
1274 if (brw->gen >= 7 && src.file != IMM)
1275 return src;
1276
1277 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1278 expanded.type = src.type;
1279 emit(BRW_OPCODE_MOV, expanded, src);
1280 return expanded;
1281 }
1282
1283 fs_inst *
1284 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1285 {
1286 switch (opcode) {
1287 case SHADER_OPCODE_RCP:
1288 case SHADER_OPCODE_RSQ:
1289 case SHADER_OPCODE_SQRT:
1290 case SHADER_OPCODE_EXP2:
1291 case SHADER_OPCODE_LOG2:
1292 case SHADER_OPCODE_SIN:
1293 case SHADER_OPCODE_COS:
1294 break;
1295 default:
1296 assert(!"not reached: bad math opcode");
1297 return NULL;
1298 }
1299
1300 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1301 * might be able to do better by doing execsize = 1 math and then
1302 * expanding that result out, but we would need to be careful with
1303 * masking.
1304 *
1305 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1306 * instructions, so we also move to a temp to set those up.
1307 */
1308 if (brw->gen >= 6)
1309 src = fix_math_operand(src);
1310
1311 fs_inst *inst = emit(opcode, dst, src);
1312
1313 if (brw->gen < 6) {
1314 inst->base_mrf = 2;
1315 inst->mlen = dispatch_width / 8;
1316 }
1317
1318 return inst;
1319 }
1320
1321 fs_inst *
1322 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1323 {
1324 int base_mrf = 2;
1325 fs_inst *inst;
1326
1327 switch (opcode) {
1328 case SHADER_OPCODE_INT_QUOTIENT:
1329 case SHADER_OPCODE_INT_REMAINDER:
1330 if (brw->gen >= 7 && dispatch_width == 16)
1331 fail("16-wide INTDIV unsupported\n");
1332 break;
1333 case SHADER_OPCODE_POW:
1334 break;
1335 default:
1336 assert(!"not reached: unsupported binary math opcode.");
1337 return NULL;
1338 }
1339
1340 if (brw->gen >= 6) {
1341 src0 = fix_math_operand(src0);
1342 src1 = fix_math_operand(src1);
1343
1344 inst = emit(opcode, dst, src0, src1);
1345 } else {
1346 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1347 * "Message Payload":
1348 *
1349 * "Operand0[7]. For the INT DIV functions, this operand is the
1350 * denominator."
1351 * ...
1352 * "Operand1[7]. For the INT DIV functions, this operand is the
1353 * numerator."
1354 */
1355 bool is_int_div = opcode != SHADER_OPCODE_POW;
1356 fs_reg &op0 = is_int_div ? src1 : src0;
1357 fs_reg &op1 = is_int_div ? src0 : src1;
1358
1359 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1360 inst = emit(opcode, dst, op0, reg_null_f);
1361
1362 inst->base_mrf = base_mrf;
1363 inst->mlen = 2 * dispatch_width / 8;
1364 }
1365 return inst;
1366 }
1367
1368 void
1369 fs_visitor::assign_curb_setup()
1370 {
1371 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1372 if (dispatch_width == 8) {
1373 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1374 } else {
1375 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1376 }
1377
1378 /* Map the offsets in the UNIFORM file to fixed HW regs. */
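   /* For example, with two payload registers, uniform slot 10 lands in the
    * fixed register g3.2 (2 + 10 / 8 = 3, subregister 10 % 8 = 2).
    */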
1379 foreach_list(node, &this->instructions) {
1380 fs_inst *inst = (fs_inst *)node;
1381
1382 for (unsigned int i = 0; i < 3; i++) {
1383 if (inst->src[i].file == UNIFORM) {
1384 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1385 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1386 constant_nr / 8,
1387 constant_nr % 8);
1388
1389 inst->src[i].file = HW_REG;
1390 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1391 }
1392 }
1393 }
1394 }
1395
1396 void
1397 fs_visitor::calculate_urb_setup()
1398 {
1399 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1400 c->prog_data.urb_setup[i] = -1;
1401 }
1402
1403 int urb_next = 0;
1404 /* Figure out where each of the incoming setup attributes lands. */
1405 if (brw->gen >= 6) {
1406 if (_mesa_bitcount_64(fp->Base.InputsRead &
1407 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1408 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1409 * first 16 varying inputs, so we can put them wherever we want.
1410 * Just put them in order.
1411 *
1412 * This is useful because it means that (a) inputs not used by the
1413 * fragment shader won't take up valuable register space, and (b) we
1414 * won't have to recompile the fragment shader if it gets paired with
1415 * a different vertex (or geometry) shader.
1416 */
1417 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1418 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1419 BITFIELD64_BIT(i)) {
1420 c->prog_data.urb_setup[i] = urb_next++;
1421 }
1422 }
1423 } else {
1424 /* We have enough input varyings that the SF/SBE pipeline stage can't
1425 * arbitrarily rearrange them to suit our whim; we have to put them
1426 * in an order that matches the output of the previous pipeline stage
1427 * (geometry or vertex shader).
1428 */
1429 struct brw_vue_map prev_stage_vue_map;
1430 brw_compute_vue_map(brw, &prev_stage_vue_map,
1431 c->key.input_slots_valid);
1432 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1433 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1434 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1435 slot++) {
1436 int varying = prev_stage_vue_map.slot_to_varying[slot];
1437 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1438 * unused.
1439 */
1440 if (varying != BRW_VARYING_SLOT_COUNT &&
1441 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1442 BITFIELD64_BIT(varying))) {
1443 c->prog_data.urb_setup[varying] = slot - first_slot;
1444 }
1445 }
1446 urb_next = prev_stage_vue_map.num_slots - first_slot;
1447 }
1448 } else {
1449 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1450 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1451 /* Point size is packed into the header, not as a general attribute */
1452 if (i == VARYING_SLOT_PSIZ)
1453 continue;
1454
1455 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1456 /* The back color slot is skipped when the front color is
1457 * also written to. In addition, some slots can be
1458 * written in the vertex shader and not read in the
1459 * fragment shader. So the register number must always be
1460 * incremented, mapped or not.
1461 */
1462 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1463 c->prog_data.urb_setup[i] = urb_next;
1464 urb_next++;
1465 }
1466 }
1467
1468 /*
 1469     * It's an FS-only attribute, and we did interpolation for this attribute
 1470     * in the SF thread. So, count it here, too.
1471 *
1472 * See compile_sf_prog() for more info.
1473 */
1474 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1475 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1476 }
1477
1478 c->prog_data.num_varying_inputs = urb_next;
1479 }
1480
1481 void
1482 fs_visitor::assign_urb_setup()
1483 {
1484 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1485
1486 /* Offset all the urb_setup[] index by the actual position of the
1487 * setup regs, now that the location of the constants has been chosen.
1488 */
1489 foreach_list(node, &this->instructions) {
1490 fs_inst *inst = (fs_inst *)node;
1491
1492 if (inst->opcode == FS_OPCODE_LINTERP) {
1493 assert(inst->src[2].file == HW_REG);
1494 inst->src[2].fixed_hw_reg.nr += urb_start;
1495 }
1496
1497 if (inst->opcode == FS_OPCODE_CINTERP) {
1498 assert(inst->src[0].file == HW_REG);
1499 inst->src[0].fixed_hw_reg.nr += urb_start;
1500 }
1501 }
1502
1503 /* Each attribute is 4 setup channels, each of which is half a reg. */
1504 this->first_non_payload_grf =
1505 urb_start + c->prog_data.num_varying_inputs * 2;
1506 }
1507
1508 /**
1509 * Split large virtual GRFs into separate components if we can.
1510 *
1511 * This is mostly duplicated with what brw_fs_vector_splitting does,
1512 * but that's really conservative because it's afraid of doing
1513 * splitting that doesn't result in real progress after the rest of
1514 * the optimization phases, which would cause infinite looping in
1515 * optimization. We can do it once here, safely. This also has the
1516 * opportunity to split interpolated values, or maybe even uniforms,
1517 * which we don't have at the IR level.
1518 *
1519 * We want to split, because virtual GRFs are what we register
1520 * allocate and spill (due to contiguousness requirements for some
1521 * instructions), and they're what we naturally generate in the
1522 * codegen process, but most virtual GRFs don't actually need to be
1523 * contiguous sets of GRFs. If we split, we'll end up with reduced
1524 * live intervals and better dead code elimination and coalescing.
1525 */
1526 void
1527 fs_visitor::split_virtual_grfs()
1528 {
1529 int num_vars = this->virtual_grf_count;
1530 bool split_grf[num_vars];
1531 int new_virtual_grf[num_vars];
1532
1533 /* Try to split anything > 0 sized. */
1534 for (int i = 0; i < num_vars; i++) {
1535 if (this->virtual_grf_sizes[i] != 1)
1536 split_grf[i] = true;
1537 else
1538 split_grf[i] = false;
1539 }
1540
1541 if (brw->has_pln &&
1542 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1543 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1544 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1545 * Gen6, that was the only supported interpolation mode, and since Gen6,
1546 * delta_x and delta_y are in fixed hardware registers.
1547 */
1548 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1549 false;
1550 }
1551
1552 foreach_list(node, &this->instructions) {
1553 fs_inst *inst = (fs_inst *)node;
1554
1555 /* If there's a SEND message that requires contiguous destination
1556 * registers, no splitting is allowed.
1557 */
1558 if (inst->regs_written > 1) {
1559 split_grf[inst->dst.reg] = false;
1560 }
1561
1562 /* If we're sending from a GRF, don't split it, on the assumption that
1563 * the send is reading the whole thing.
1564 */
1565 if (inst->is_send_from_grf()) {
1566 for (int i = 0; i < 3; i++) {
1567 if (inst->src[i].file == GRF) {
1568 split_grf[inst->src[i].reg] = false;
1569 }
1570 }
1571 }
1572 }
1573
1574 /* Allocate new space for split regs. Note that the virtual
1575 * numbers will be contiguous.
1576 */
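   /* reg_offset 0 keeps the original (now size-1) register; offsets
    * 1..size-1 are redirected to the newly allocated registers in the
    * rewrite loop below.
    */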
1577 for (int i = 0; i < num_vars; i++) {
1578 if (split_grf[i]) {
1579 new_virtual_grf[i] = virtual_grf_alloc(1);
1580 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1581 int reg = virtual_grf_alloc(1);
1582 assert(reg == new_virtual_grf[i] + j - 1);
1583 (void) reg;
1584 }
1585 this->virtual_grf_sizes[i] = 1;
1586 }
1587 }
1588
1589 foreach_list(node, &this->instructions) {
1590 fs_inst *inst = (fs_inst *)node;
1591
1592 if (inst->dst.file == GRF &&
1593 split_grf[inst->dst.reg] &&
1594 inst->dst.reg_offset != 0) {
1595 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1596 inst->dst.reg_offset - 1);
1597 inst->dst.reg_offset = 0;
1598 }
1599 for (int i = 0; i < 3; i++) {
1600 if (inst->src[i].file == GRF &&
1601 split_grf[inst->src[i].reg] &&
1602 inst->src[i].reg_offset != 0) {
1603 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1604 inst->src[i].reg_offset - 1);
1605 inst->src[i].reg_offset = 0;
1606 }
1607 }
1608 }
1609 invalidate_live_intervals();
1610 }
1611
1612 /**
1613 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1614 *
1615 * During code generation, we create tons of temporary variables, many of
1616 * which get immediately killed and are never used again. Yet, in later
1617 * optimization and analysis passes, such as compute_live_intervals, we need
1618 * to loop over all the virtual GRFs. Compacting them can save a lot of
1619 * overhead.
1620 */
1621 void
1622 fs_visitor::compact_virtual_grfs()
1623 {
1624 /* Mark which virtual GRFs are used, and count how many. */
1625 int remap_table[this->virtual_grf_count];
1626 memset(remap_table, -1, sizeof(remap_table));
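   /* -1 means "unused"; used registers are first marked with 0 and later
    * overwritten with their compacted index.
    */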
1627
1628 foreach_list(node, &this->instructions) {
1629 const fs_inst *inst = (const fs_inst *) node;
1630
1631 if (inst->dst.file == GRF)
1632 remap_table[inst->dst.reg] = 0;
1633
1634 for (int i = 0; i < 3; i++) {
1635 if (inst->src[i].file == GRF)
1636 remap_table[inst->src[i].reg] = 0;
1637 }
1638 }
1639
1640 /* In addition to registers used in instructions, fs_visitor keeps
1641 * direct references to certain special values which must be patched:
1642 */
1643 fs_reg *special[] = {
1644 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1645 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1646 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1647 &delta_x[0], &delta_x[1], &delta_x[2],
1648 &delta_x[3], &delta_x[4], &delta_x[5],
1649 &delta_y[0], &delta_y[1], &delta_y[2],
1650 &delta_y[3], &delta_y[4], &delta_y[5],
1651 };
1652 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1653 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1654
1655 /* Treat all special values as used, to be conservative */
1656 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1657 if (special[i]->file == GRF)
1658 remap_table[special[i]->reg] = 0;
1659 }
1660
1661 /* Compact the GRF arrays. */
1662 int new_index = 0;
1663 for (int i = 0; i < this->virtual_grf_count; i++) {
1664 if (remap_table[i] != -1) {
1665 remap_table[i] = new_index;
1666 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1667 invalidate_live_intervals();
1668 ++new_index;
1669 }
1670 }
1671
1672 this->virtual_grf_count = new_index;
1673
1674 /* Patch all the instructions to use the newly renumbered registers */
1675 foreach_list(node, &this->instructions) {
1676 fs_inst *inst = (fs_inst *) node;
1677
1678 if (inst->dst.file == GRF)
1679 inst->dst.reg = remap_table[inst->dst.reg];
1680
1681 for (int i = 0; i < 3; i++) {
1682 if (inst->src[i].file == GRF)
1683 inst->src[i].reg = remap_table[inst->src[i].reg];
1684 }
1685 }
1686
1687 /* Patch all the references to special values */
1688 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1689 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1690 special[i]->reg = remap_table[special[i]->reg];
1691 }
1692 }
1693
1694 bool
1695 fs_visitor::remove_dead_constants()
1696 {
1697 if (dispatch_width == 8) {
1698 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1699 this->nr_params_remap = c->prog_data.nr_params;
1700
1701 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1702 this->params_remap[i] = -1;
1703
1704 /* Find which params are still in use. */
1705 foreach_list(node, &this->instructions) {
1706 fs_inst *inst = (fs_inst *)node;
1707
1708 for (int i = 0; i < 3; i++) {
1709 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1710
1711 if (inst->src[i].file != UNIFORM)
1712 continue;
1713
1714 /* Section 5.11 of the OpenGL 4.3 spec says:
1715 *
1716 * "Out-of-bounds reads return undefined values, which include
1717 * values from other variables of the active program or zero."
1718 */
1719 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1720 constant_nr = 0;
1721 }
1722
1723 /* For now, set this to non-negative. We'll give it the
1724 * actual new number in a moment, in order to keep the
1725 * register numbers nicely ordered.
1726 */
1727 this->params_remap[constant_nr] = 0;
1728 }
1729 }
1730
1731 /* Figure out what the new numbers for the params will be. At some
1732 * point when we're doing uniform array access, we're going to want
1733 * to keep the distinction between .reg and .reg_offset, but for
1734 * now we don't care.
1735 */
1736 unsigned int new_nr_params = 0;
1737 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1738 if (this->params_remap[i] != -1) {
1739 this->params_remap[i] = new_nr_params++;
1740 }
1741 }
1742
1743 /* Update the list of params to be uploaded to match our new numbering. */
1744 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1745 int remapped = this->params_remap[i];
1746
1747 if (remapped == -1)
1748 continue;
1749
1750 c->prog_data.param[remapped] = c->prog_data.param[i];
1751 }
1752
1753 c->prog_data.nr_params = new_nr_params;
1754 } else {
1755 /* This should have been generated in the 8-wide pass already. */
1756 assert(this->params_remap);
1757 }
1758
1759 /* Now do the renumbering of the shader to remove unused params. */
1760 foreach_list(node, &this->instructions) {
1761 fs_inst *inst = (fs_inst *)node;
1762
1763 for (int i = 0; i < 3; i++) {
1764 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1765
1766 if (inst->src[i].file != UNIFORM)
1767 continue;
1768
1769 /* as above alias to 0 */
1770 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1771 constant_nr = 0;
1772 }
1773 assert(this->params_remap[constant_nr] != -1);
1774 inst->src[i].reg = this->params_remap[constant_nr];
1775 inst->src[i].reg_offset = 0;
1776 }
1777 }
1778
1779 return true;
1780 }
1781
1782 /*
1783 * Implements array access of uniforms by inserting a
1784 * PULL_CONSTANT_LOAD instruction.
1785 *
1786 * Unlike temporary GRF array access (where we don't support it due to
1787 * the difficulty of doing relative addressing on instruction
1788 * destinations), we could potentially do array access of uniforms
1789 * that were loaded in GRF space as push constants. In real-world
1790 * usage we've seen, though, the arrays being used are always larger
1791 * than we could load as push constants, so just always move all
1792 * uniform array access out to a pull constant buffer.
1793 */
1794 void
1795 fs_visitor::move_uniform_array_access_to_pull_constants()
1796 {
1797 int pull_constant_loc[c->prog_data.nr_params];
1798
1799 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1800 pull_constant_loc[i] = -1;
1801 }
1802
1803 /* Walk through and find array access of uniforms. Put a copy of that
1804 * uniform in the pull constant buffer.
1805 *
1806 * Note that we don't move constant-indexed accesses to arrays. No
1807 * testing has been done of the performance impact of this choice.
1808 */
1809 foreach_list_safe(node, &this->instructions) {
1810 fs_inst *inst = (fs_inst *)node;
1811
1812 for (int i = 0 ; i < 3; i++) {
1813 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1814 continue;
1815
1816 int uniform = inst->src[i].reg;
1817
1818 /* If this array isn't already present in the pull constant buffer,
1819 * add it.
1820 */
1821 if (pull_constant_loc[uniform] == -1) {
1822 const float **values = &c->prog_data.param[uniform];
1823
1824 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1825
1826 assert(param_size[uniform]);
1827
1828 for (int j = 0; j < param_size[uniform]; j++) {
1829 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1830 values[j];
1831 }
1832 }
1833
1834 /* Set up the annotation tracking for new generated instructions. */
1835 base_ir = inst->ir;
1836 current_annotation = inst->annotation;
1837
1838 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1839 fs_reg temp = fs_reg(this, glsl_type::float_type);
1840 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1841 surf_index,
1842 *inst->src[i].reladdr,
1843 pull_constant_loc[uniform] +
1844 inst->src[i].reg_offset);
1845 inst->insert_before(&list);
1846
1847 inst->src[i].file = temp.file;
1848 inst->src[i].reg = temp.reg;
1849 inst->src[i].reg_offset = temp.reg_offset;
1850 inst->src[i].reladdr = NULL;
1851 }
1852 }
1853 }
1854
1855 /**
1856 * Choose accesses from the UNIFORM file to demote to using the pull
1857 * constant buffer.
1858 *
1859 * We allow a fragment shader to have more than the specified minimum
1860 * maximum number of fragment shader uniform components (64). If
1861 * there are too many of these, they'd fill up all of register space.
1862 * So, this will push some of them out to the pull constant buffer and
1863 * update the program to load them.
1864 */
1865 void
1866 fs_visitor::setup_pull_constants()
1867 {
1868 /* Only allow 16 registers (128 uniform components) as push constants. */
1869 unsigned int max_uniform_components = 16 * 8;
1870 if (c->prog_data.nr_params <= max_uniform_components)
1871 return;
1872
1873 if (dispatch_width == 16) {
1874 fail("Pull constants not supported in 16-wide\n");
1875 return;
1876 }
1877
1878 /* Just demote the end of the list. We could probably do better
1879 * here, demoting things that are rarely used in the program first.
1880 */
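   /* For example, a program with 200 float uniform components keeps
    * components 0..127 as push constants and demotes 128..199 to the pull
    * constant buffer, to be fetched with FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD
    * below.
    */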
1881 unsigned int pull_uniform_base = max_uniform_components;
1882
1883 int pull_constant_loc[c->prog_data.nr_params];
1884 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1885 if (i < pull_uniform_base) {
1886 pull_constant_loc[i] = -1;
1887 } else {
1888 pull_constant_loc[i] = -1;
1889 /* If our constant is already being uploaded for reladdr purposes,
1890 * reuse it.
1891 */
1892 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1893 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1894 pull_constant_loc[i] = j;
1895 break;
1896 }
1897 }
1898 if (pull_constant_loc[i] == -1) {
1899 int pull_index = c->prog_data.nr_pull_params++;
1900 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1901             pull_constant_loc[i] = pull_index;
1902 }
1903 }
1904 }
1905 c->prog_data.nr_params = pull_uniform_base;
1906
1907 foreach_list(node, &this->instructions) {
1908 fs_inst *inst = (fs_inst *)node;
1909
1910 for (int i = 0; i < 3; i++) {
1911 if (inst->src[i].file != UNIFORM)
1912 continue;
1913
1914 int pull_index = pull_constant_loc[inst->src[i].reg +
1915 inst->src[i].reg_offset];
1916 if (pull_index == -1)
1917 continue;
1918
1919 assert(!inst->src[i].reladdr);
1920
1921 fs_reg dst = fs_reg(this, glsl_type::float_type);
1922 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1923 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1924 fs_inst *pull =
1925 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1926 dst, index, offset);
1927 pull->ir = inst->ir;
1928 pull->annotation = inst->annotation;
1929
1930 inst->insert_before(pull);
1931
1932 inst->src[i].file = GRF;
1933 inst->src[i].reg = dst.reg;
1934 inst->src[i].reg_offset = 0;
1935 inst->src[i].smear = pull_index & 3;
1936 }
1937 }
1938 }
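
/* Illustrative example (assumed values, not from a real shader): with the
 * 128-component push limit above, a demoted uniform component that lands at
 * pull_index 13 is fetched as a whole 16-byte-aligned slot from byte offset
 * (13 * 4) & ~15 == 48, and the consuming instruction then reads dword
 * 13 & 3 == 1 of the loaded register via src[i].smear.
 */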
1939
1940 bool
1941 fs_visitor::opt_algebraic()
1942 {
1943 bool progress = false;
1944
1945 foreach_list(node, &this->instructions) {
1946 fs_inst *inst = (fs_inst *)node;
1947
1948 switch (inst->opcode) {
1949 case BRW_OPCODE_MUL:
1950 if (inst->src[1].file != IMM)
1951 continue;
1952
1953 /* a * 1.0 = a */
1954 if (inst->src[1].is_one()) {
1955 inst->opcode = BRW_OPCODE_MOV;
1956 inst->src[1] = reg_undef;
1957 progress = true;
1958 break;
1959 }
1960
1961 /* a * 0.0 = 0.0 */
1962 if (inst->src[1].is_zero()) {
1963 inst->opcode = BRW_OPCODE_MOV;
1964 inst->src[0] = inst->src[1];
1965 inst->src[1] = reg_undef;
1966 progress = true;
1967 break;
1968 }
1969
1970 break;
1971 case BRW_OPCODE_ADD:
1972 if (inst->src[1].file != IMM)
1973 continue;
1974
1975 /* a + 0.0 = a */
1976 if (inst->src[1].is_zero()) {
1977 inst->opcode = BRW_OPCODE_MOV;
1978 inst->src[1] = reg_undef;
1979 progress = true;
1980 break;
1981 }
1982 break;
1983 case BRW_OPCODE_OR:
1984 if (inst->src[0].equals(inst->src[1])) {
1985 inst->opcode = BRW_OPCODE_MOV;
1986 inst->src[1] = reg_undef;
1987 progress = true;
1988 break;
1989 }
1990 break;
1991 case BRW_OPCODE_SEL:
1992 if (inst->saturate && inst->src[1].file == IMM) {
1993 switch (inst->conditional_mod) {
1994 case BRW_CONDITIONAL_LE:
1995 case BRW_CONDITIONAL_L:
1996 switch (inst->src[1].type) {
1997 case BRW_REGISTER_TYPE_F:
1998 if (inst->src[1].imm.f >= 1.0f) {
1999 inst->opcode = BRW_OPCODE_MOV;
2000 inst->src[1] = reg_undef;
2001 progress = true;
2002 }
2003 break;
2004 default:
2005 break;
2006 }
2007 break;
2008 case BRW_CONDITIONAL_GE:
2009 case BRW_CONDITIONAL_G:
2010 switch (inst->src[1].type) {
2011 case BRW_REGISTER_TYPE_F:
2012 if (inst->src[1].imm.f <= 0.0f) {
2013 inst->opcode = BRW_OPCODE_MOV;
2014 inst->src[1] = reg_undef;
2015 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2016 progress = true;
2017 }
2018 break;
2019 default:
2020 break;
2021 }
2022 default:
2023 break;
2024 }
2025 }
2026 break;
2027 default:
2028 break;
2029 }
2030 }
2031
2032 return progress;
2033 }
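
/* Illustrative sketch (register numbers are made up, in approximate
 * dump_instruction() notation) of the simplifications above:
 *
 *    mul vgrf5, vgrf3, 1.0f    ->  mov vgrf5, vgrf3      (a * 1.0 = a)
 *    mul vgrf5, vgrf3, 0.0f    ->  mov vgrf5, 0.0f       (a * 0.0 = 0.0)
 *    add vgrf5, vgrf3, 0.0f    ->  mov vgrf5, vgrf3      (a + 0.0 = a)
 *    or  vgrf5, vgrf3, vgrf3   ->  mov vgrf5, vgrf3      (a | a = a)
 */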
2034
2035 /**
2036 * Removes any instructions writing a VGRF where that VGRF is not used by any
2037 * later instruction.
2038 */
2039 bool
2040 fs_visitor::dead_code_eliminate()
2041 {
2042 bool progress = false;
2043 int pc = 0;
2044
2045 calculate_live_intervals();
2046
2047 foreach_list_safe(node, &this->instructions) {
2048 fs_inst *inst = (fs_inst *)node;
2049
2050 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2051 bool dead = true;
2052
2053 for (int i = 0; i < inst->regs_written; i++) {
2054 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2055 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2056 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2057 dead = false;
2058 break;
2059 }
2060 }
2061
2062 if (dead) {
2063 /* Don't dead code eliminate instructions that write to the
2064 * accumulator as a side-effect. Instead just set the destination
2065 * to the null register to free it.
2066 */
2067 switch (inst->opcode) {
2068 case BRW_OPCODE_ADDC:
2069 case BRW_OPCODE_SUBB:
2070 case BRW_OPCODE_MACH:
2071 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2072 break;
2073 default:
2074 inst->remove();
2075 progress = true;
2076 break;
2077 }
2078 }
2079 }
2080
2081 pc++;
2082 }
2083
2084 if (progress)
2085 invalidate_live_intervals();
2086
2087 return progress;
2088 }
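
/* Illustrative sketch (register numbers are made up): a write whose
 * destination's live interval ends at the instruction itself is dead:
 *
 *    mov vgrf9, vgrf2            vgrf9 never read afterwards -> removed
 *
 * while an accumulator-writing instruction such as MACH is kept for its side
 * effect and only has its destination redirected to the hardware null
 * register, as in the switch above.
 */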
2089
2090 struct dead_code_hash_key
2091 {
2092 int vgrf;
2093 int reg_offset;
2094 };
2095
2096 static bool
2097 dead_code_hash_compare(const void *a, const void *b)
2098 {
2099 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2100 }
2101
2102 static void
2103 clear_dead_code_hash(struct hash_table *ht)
2104 {
2105 struct hash_entry *entry;
2106
2107 hash_table_foreach(ht, entry) {
2108 _mesa_hash_table_remove(ht, entry);
2109 }
2110 }
2111
2112 static void
2113 insert_dead_code_hash(struct hash_table *ht,
2114 int vgrf, int reg_offset, fs_inst *inst)
2115 {
2116 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2117 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2118
2119 key->vgrf = vgrf;
2120 key->reg_offset = reg_offset;
2121
2122 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2123 }
2124
2125 static struct hash_entry *
2126 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2127 {
2128 struct dead_code_hash_key key;
2129
2130 key.vgrf = vgrf;
2131 key.reg_offset = reg_offset;
2132
2133 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2134 }
2135
2136 static void
2137 remove_dead_code_hash(struct hash_table *ht,
2138 int vgrf, int reg_offset)
2139 {
2140 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2141 if (!entry)
2142 return;
2143
2144 _mesa_hash_table_remove(ht, entry);
2145 }
2146
2147 /**
2148 * Walks basic blocks, removing any regs that are written but not read before
2149 * being redefined.
2150 *
2151 * The dead_code_eliminate() function implements a global dead code
2152  * elimination, but it only handles removing the last write to a register
2153 * if it's never read. This one can handle intermediate writes, but only
2154 * within a basic block.
2155 */
2156 bool
2157 fs_visitor::dead_code_eliminate_local()
2158 {
2159 struct hash_table *ht;
2160 bool progress = false;
2161
2162 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2163
2164 foreach_list_safe(node, &this->instructions) {
2165 fs_inst *inst = (fs_inst *)node;
2166
2167       /* At a basic block boundary, empty the HT since we don't track
2168        * dataflow across blocks here.
2169 */
2170 if (inst->is_control_flow()) {
2171 clear_dead_code_hash(ht);
2172 continue;
2173 }
2174
2175 /* Clear the HT of any instructions that got read. */
2176 for (int i = 0; i < 3; i++) {
2177 fs_reg src = inst->src[i];
2178 if (src.file != GRF)
2179 continue;
2180
2181 int read = 1;
2182 if (inst->is_send_from_grf())
2183 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2184
2185 for (int reg_offset = src.reg_offset;
2186 reg_offset < src.reg_offset + read;
2187 reg_offset++) {
2188 remove_dead_code_hash(ht, src.reg, reg_offset);
2189 }
2190 }
2191
2192 /* Add any update of a GRF to the HT, removing a previous write if it
2193 * wasn't read.
2194 */
2195 if (inst->dst.file == GRF) {
2196 if (inst->regs_written > 1) {
2197 /* We don't know how to trim channels from an instruction's
2198 * writes, so we can't incrementally remove unread channels from
2199             * it. Just remove whatever it overwrites from the table.
2200 */
2201 for (int i = 0; i < inst->regs_written; i++) {
2202 remove_dead_code_hash(ht,
2203 inst->dst.reg,
2204 inst->dst.reg_offset + i);
2205 }
2206 } else {
2207 struct hash_entry *entry =
2208 get_dead_code_hash_entry(ht, inst->dst.reg,
2209 inst->dst.reg_offset);
2210
2211 if (entry) {
2212 if (inst->is_partial_write()) {
2213 /* For a partial write, we can't remove any previous dead code
2214 * candidate, since we're just modifying their result.
2215 */
2216 } else {
2217 /* We're completely updating a channel, and there was a
2218 * previous write to the channel that wasn't read. Kill it!
2219 */
2220 fs_inst *inst = (fs_inst *)entry->data;
2221 inst->remove();
2222 progress = true;
2223 }
2224
2225 _mesa_hash_table_remove(ht, entry);
2226 }
2227
2228 if (!inst->has_side_effects())
2229 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2230 inst);
2231 }
2232 }
2233 }
2234
2235 _mesa_hash_table_destroy(ht, NULL);
2236
2237 if (progress)
2238 invalidate_live_intervals();
2239
2240 return progress;
2241 }
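
/* Illustrative sketch (register numbers are made up): within a basic block,
 * an intermediate write that is completely overwritten before being read is
 * removed:
 *
 *    mov vgrf3, u0               never read before the next full write ->
 *    mov vgrf3, u1                  the first MOV is removed
 *    add vgrf4, vgrf3, vgrf2
 */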
2242
2243 /**
2244 * Implements register coalescing: Checks if the two registers involved in a
2245 * raw move don't interfere, in which case they can both be stored in the same
2246 * place and the MOV removed.
2247 */
2248 bool
2249 fs_visitor::register_coalesce()
2250 {
2251 bool progress = false;
2252
2253 calculate_live_intervals();
2254
2255 foreach_list_safe(node, &this->instructions) {
2256 fs_inst *inst = (fs_inst *)node;
2257
2258 if (inst->opcode != BRW_OPCODE_MOV ||
2259 inst->is_partial_write() ||
2260 inst->saturate ||
2261 inst->src[0].file != GRF ||
2262 inst->src[0].negate ||
2263 inst->src[0].abs ||
2264 inst->src[0].smear != -1 ||
2265 inst->dst.file != GRF ||
2266 inst->dst.type != inst->src[0].type ||
2267 virtual_grf_sizes[inst->src[0].reg] != 1) {
2268 continue;
2269 }
2270
2271 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2272 int var_to = live_intervals->var_from_reg(&inst->dst);
2273
2274 if (live_intervals->vars_interfere(var_from, var_to) &&
2275 !inst->dst.equals(inst->src[0]))
2276 continue;
2277
2278 int reg_from = inst->src[0].reg;
2279 assert(inst->src[0].reg_offset == 0);
2280 int reg_to = inst->dst.reg;
2281 int reg_to_offset = inst->dst.reg_offset;
2282
2283 foreach_list(node, &this->instructions) {
2284 fs_inst *scan_inst = (fs_inst *)node;
2285
2286 if (scan_inst->dst.file == GRF &&
2287 scan_inst->dst.reg == reg_from) {
2288 scan_inst->dst.reg = reg_to;
2289 scan_inst->dst.reg_offset = reg_to_offset;
2290 }
2291 for (int i = 0; i < 3; i++) {
2292 if (scan_inst->src[i].file == GRF &&
2293 scan_inst->src[i].reg == reg_from) {
2294 scan_inst->src[i].reg = reg_to;
2295 scan_inst->src[i].reg_offset = reg_to_offset;
2296 }
2297 }
2298 }
2299
2300 inst->remove();
2301 progress = true;
2302 continue;
2303 }
2304
2305 if (progress)
2306 invalidate_live_intervals();
2307
2308 return progress;
2309 }
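
/* Illustrative sketch (register numbers are made up): given a raw MOV whose
 * source and destination don't interfere, every def and use of the source
 * VGRF is retargeted to the destination and the MOV is removed:
 *
 *    add vgrf4, vgrf1, vgrf2             add vgrf7+2, vgrf1, vgrf2
 *    mov vgrf7+2, vgrf4            ->
 */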
2310
2311 bool
2312 fs_visitor::compute_to_mrf()
2313 {
2314 bool progress = false;
2315 int next_ip = 0;
2316
2317 calculate_live_intervals();
2318
2319 foreach_list_safe(node, &this->instructions) {
2320 fs_inst *inst = (fs_inst *)node;
2321
2322 int ip = next_ip;
2323 next_ip++;
2324
2325 if (inst->opcode != BRW_OPCODE_MOV ||
2326 inst->is_partial_write() ||
2327 inst->dst.file != MRF || inst->src[0].file != GRF ||
2328 inst->dst.type != inst->src[0].type ||
2329 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2330 continue;
2331
2332 /* Work out which hardware MRF registers are written by this
2333 * instruction.
2334 */
2335 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2336 int mrf_high;
2337 if (inst->dst.reg & BRW_MRF_COMPR4) {
2338 mrf_high = mrf_low + 4;
2339 } else if (dispatch_width == 16 &&
2340 (!inst->force_uncompressed && !inst->force_sechalf)) {
2341 mrf_high = mrf_low + 1;
2342 } else {
2343 mrf_high = mrf_low;
2344 }
2345
2346 /* Can't compute-to-MRF this GRF if someone else was going to
2347 * read it later.
2348 */
2349 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2350 continue;
2351
2352 /* Found a move of a GRF to a MRF. Let's see if we can go
2353 * rewrite the thing that made this GRF to write into the MRF.
2354 */
2355 fs_inst *scan_inst;
2356 for (scan_inst = (fs_inst *)inst->prev;
2357 scan_inst->prev != NULL;
2358 scan_inst = (fs_inst *)scan_inst->prev) {
2359 if (scan_inst->dst.file == GRF &&
2360 scan_inst->dst.reg == inst->src[0].reg) {
2361 /* Found the last thing to write our reg we want to turn
2362 * into a compute-to-MRF.
2363 */
2364
2365 /* If this one instruction didn't populate all the
2366 * channels, bail. We might be able to rewrite everything
2367 * that writes that reg, but it would require smarter
2368 * tracking to delay the rewriting until complete success.
2369 */
2370 if (scan_inst->is_partial_write())
2371 break;
2372
2373 /* Things returning more than one register would need us to
2374 * understand coalescing out more than one MOV at a time.
2375 */
2376 if (scan_inst->regs_written > 1)
2377 break;
2378
2379 /* SEND instructions can't have MRF as a destination. */
2380 if (scan_inst->mlen)
2381 break;
2382
2383 if (brw->gen == 6) {
2384 /* gen6 math instructions must have the destination be
2385 * GRF, so no compute-to-MRF for them.
2386 */
2387 if (scan_inst->is_math()) {
2388 break;
2389 }
2390 }
2391
2392 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2393 /* Found the creator of our MRF's source value. */
2394 scan_inst->dst.file = MRF;
2395 scan_inst->dst.reg = inst->dst.reg;
2396 scan_inst->saturate |= inst->saturate;
2397 inst->remove();
2398 progress = true;
2399 }
2400 break;
2401 }
2402
2403       /* We don't handle control flow here. Most computation of
2404        * values that end up in MRFs happens shortly before the MRF
2405        * write anyway.
2406 */
2407 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2408 break;
2409
2410 /* You can't read from an MRF, so if someone else reads our
2411 * MRF's source GRF that we wanted to rewrite, that stops us.
2412 */
2413 bool interfered = false;
2414 for (int i = 0; i < 3; i++) {
2415 if (scan_inst->src[i].file == GRF &&
2416 scan_inst->src[i].reg == inst->src[0].reg &&
2417 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2418 interfered = true;
2419 }
2420 }
2421 if (interfered)
2422 break;
2423
2424 if (scan_inst->dst.file == MRF) {
2425 /* If somebody else writes our MRF here, we can't
2426 * compute-to-MRF before that.
2427 */
2428 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2429 int scan_mrf_high;
2430
2431 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2432 scan_mrf_high = scan_mrf_low + 4;
2433 } else if (dispatch_width == 16 &&
2434 (!scan_inst->force_uncompressed &&
2435 !scan_inst->force_sechalf)) {
2436 scan_mrf_high = scan_mrf_low + 1;
2437 } else {
2438 scan_mrf_high = scan_mrf_low;
2439 }
2440
2441 if (mrf_low == scan_mrf_low ||
2442 mrf_low == scan_mrf_high ||
2443 mrf_high == scan_mrf_low ||
2444 mrf_high == scan_mrf_high) {
2445 break;
2446 }
2447 }
2448
2449 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2450 /* Found a SEND instruction, which means that there are
2451 * live values in MRFs from base_mrf to base_mrf +
2452 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2453 * above it.
2454 */
2455 if (mrf_low >= scan_inst->base_mrf &&
2456 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2457 break;
2458 }
2459 if (mrf_high >= scan_inst->base_mrf &&
2460 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2461 break;
2462 }
2463 }
2464 }
2465 }
2466
2467 if (progress)
2468 invalidate_live_intervals();
2469
2470 return progress;
2471 }
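
/* Illustrative sketch (register numbers are made up): when the GRF written
 * by the MOV below isn't read later, the producing instruction is retargeted
 * to write the MRF directly and the MOV disappears:
 *
 *    add vgrf4, vgrf1, vgrf2             add m3, vgrf1, vgrf2
 *    mov m3, vgrf4                 ->
 */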
2472
2473 /**
2474 * Walks through basic blocks, looking for repeated MRF writes and
2475 * removing the later ones.
2476 */
2477 bool
2478 fs_visitor::remove_duplicate_mrf_writes()
2479 {
2480 fs_inst *last_mrf_move[16];
2481 bool progress = false;
2482
2483 /* Need to update the MRF tracking for compressed instructions. */
2484 if (dispatch_width == 16)
2485 return false;
2486
2487 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2488
2489 foreach_list_safe(node, &this->instructions) {
2490 fs_inst *inst = (fs_inst *)node;
2491
2492 if (inst->is_control_flow()) {
2493 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2494 }
2495
2496 if (inst->opcode == BRW_OPCODE_MOV &&
2497 inst->dst.file == MRF) {
2498 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2499 if (prev_inst && inst->equals(prev_inst)) {
2500 inst->remove();
2501 progress = true;
2502 continue;
2503 }
2504 }
2505
2506 /* Clear out the last-write records for MRFs that were overwritten. */
2507 if (inst->dst.file == MRF) {
2508 last_mrf_move[inst->dst.reg] = NULL;
2509 }
2510
2511 if (inst->mlen > 0 && inst->base_mrf != -1) {
2512 /* Found a SEND instruction, which will include two or fewer
2513 * implied MRF writes. We could do better here.
2514 */
2515 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2516 last_mrf_move[inst->base_mrf + i] = NULL;
2517 }
2518 }
2519
2520 /* Clear out any MRF move records whose sources got overwritten. */
2521 if (inst->dst.file == GRF) {
2522 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2523 if (last_mrf_move[i] &&
2524 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2525 last_mrf_move[i] = NULL;
2526 }
2527 }
2528 }
2529
2530 if (inst->opcode == BRW_OPCODE_MOV &&
2531 inst->dst.file == MRF &&
2532 inst->src[0].file == GRF &&
2533 !inst->is_partial_write()) {
2534 last_mrf_move[inst->dst.reg] = inst;
2535 }
2536 }
2537
2538 if (progress)
2539 invalidate_live_intervals();
2540
2541 return progress;
2542 }
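
/* Illustrative sketch (register numbers are made up): within a basic block,
 * a repeated, identical MRF setup is dropped as long as neither the MRF nor
 * its source GRF was written in between and no control flow intervenes:
 *
 *    mov m2, vgrf5
 *    ...                          (no write to m2 or vgrf5)
 *    mov m2, vgrf5                <- removed
 */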
2543
2544 static void
2545 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2546 int first_grf, int grf_len)
2547 {
2548 bool inst_16wide = (dispatch_width > 8 &&
2549 !inst->force_uncompressed &&
2550 !inst->force_sechalf);
2551
2552 /* Clear the flag for registers that actually got read (as expected). */
2553 for (int i = 0; i < 3; i++) {
2554 int grf;
2555 if (inst->src[i].file == GRF) {
2556 grf = inst->src[i].reg;
2557 } else if (inst->src[i].file == HW_REG &&
2558 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2559 grf = inst->src[i].fixed_hw_reg.nr;
2560 } else {
2561 continue;
2562 }
2563
2564 if (grf >= first_grf &&
2565 grf < first_grf + grf_len) {
2566 deps[grf - first_grf] = false;
2567 if (inst_16wide)
2568 deps[grf - first_grf + 1] = false;
2569 }
2570 }
2571 }
2572
2573 /**
2574 * Implements this workaround for the original 965:
2575 *
2576 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2577 * check for post destination dependencies on this instruction, software
2578 * must ensure that there is no destination hazard for the case of ‘write
2579 * followed by a posted write’ shown in the following example.
2580 *
2581 * 1. mov r3 0
2582 * 2. send r3.xy <rest of send instruction>
2583 * 3. mov r2 r3
2584 *
2585 * Due to no post-destination dependency check on the ‘send’, the above
2586 * code sequence could have two instructions (1 and 2) in flight at the
2587  *    same time that both consider ‘r3’ as the target of their final writes."
2588 */
2589 void
2590 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2591 {
2592 int reg_size = dispatch_width / 8;
2593 int write_len = inst->regs_written * reg_size;
2594 int first_write_grf = inst->dst.reg;
2595 bool needs_dep[BRW_MAX_MRF];
2596 assert(write_len < (int)sizeof(needs_dep) - 1);
2597
2598 memset(needs_dep, false, sizeof(needs_dep));
2599 memset(needs_dep, true, write_len);
2600
2601 clear_deps_for_inst_src(inst, dispatch_width,
2602 needs_dep, first_write_grf, write_len);
2603
2604 /* Walk backwards looking for writes to registers we're writing which
2605 * aren't read since being written. If we hit the start of the program,
2606 * we assume that there are no outstanding dependencies on entry to the
2607 * program.
2608 */
2609 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2610 scan_inst != NULL;
2611 scan_inst = (fs_inst *)scan_inst->prev) {
2612
2613 /* If we hit control flow, assume that there *are* outstanding
2614 * dependencies, and force their cleanup before our instruction.
2615 */
2616 if (scan_inst->is_control_flow()) {
2617 for (int i = 0; i < write_len; i++) {
2618 if (needs_dep[i]) {
2619 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2620 }
2621 }
2622 return;
2623 }
2624
2625 bool scan_inst_16wide = (dispatch_width > 8 &&
2626 !scan_inst->force_uncompressed &&
2627 !scan_inst->force_sechalf);
2628
2629 /* We insert our reads as late as possible on the assumption that any
2630 * instruction but a MOV that might have left us an outstanding
2631 * dependency has more latency than a MOV.
2632 */
2633 if (scan_inst->dst.file == GRF) {
2634 for (int i = 0; i < scan_inst->regs_written; i++) {
2635 int reg = scan_inst->dst.reg + i * reg_size;
2636
2637 if (reg >= first_write_grf &&
2638 reg < first_write_grf + write_len &&
2639 needs_dep[reg - first_write_grf]) {
2640 inst->insert_before(DEP_RESOLVE_MOV(reg));
2641 needs_dep[reg - first_write_grf] = false;
2642 if (scan_inst_16wide)
2643 needs_dep[reg - first_write_grf + 1] = false;
2644 }
2645 }
2646 }
2647
2648 /* Clear the flag for registers that actually got read (as expected). */
2649 clear_deps_for_inst_src(scan_inst, dispatch_width,
2650 needs_dep, first_write_grf, write_len);
2651
2652 /* Continue the loop only if we haven't resolved all the dependencies */
2653 int i;
2654 for (i = 0; i < write_len; i++) {
2655 if (needs_dep[i])
2656 break;
2657 }
2658 if (i == write_len)
2659 return;
2660 }
2661 }
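
/* Illustrative sketch (register numbers are made up), mirroring the erratum
 * example above: a sequence like
 *
 *    mov  vgrf3, vgrf0
 *    send vgrf3, ...              (posted write with no destination hazard
 *                                  check)
 *
 * gets a dependency-resolving MOV inserted before the send, roughly
 *
 *    mov  vgrf3, vgrf0
 *    DEP_RESOLVE_MOV(vgrf3)       sources vgrf3 so the first write completes
 *    send vgrf3, ...
 */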
2662
2663 /**
2664 * Implements this workaround for the original 965:
2665 *
2666 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2667 * used as a destination register until after it has been sourced by an
2668  *      instruction with a different destination register."
2669 */
2670 void
2671 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2672 {
2673 int write_len = inst->regs_written * dispatch_width / 8;
2674 int first_write_grf = inst->dst.reg;
2675 bool needs_dep[BRW_MAX_MRF];
2676 assert(write_len < (int)sizeof(needs_dep) - 1);
2677
2678 memset(needs_dep, false, sizeof(needs_dep));
2679 memset(needs_dep, true, write_len);
2680 /* Walk forwards looking for writes to registers we're writing which aren't
2681 * read before being written.
2682 */
2683 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2684 !scan_inst->is_tail_sentinel();
2685 scan_inst = (fs_inst *)scan_inst->next) {
2686 /* If we hit control flow, force resolve all remaining dependencies. */
2687 if (scan_inst->is_control_flow()) {
2688 for (int i = 0; i < write_len; i++) {
2689 if (needs_dep[i])
2690 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2691 }
2692 return;
2693 }
2694
2695 /* Clear the flag for registers that actually got read (as expected). */
2696 clear_deps_for_inst_src(scan_inst, dispatch_width,
2697 needs_dep, first_write_grf, write_len);
2698
2699 /* We insert our reads as late as possible since they're reading the
2700 * result of a SEND, which has massive latency.
2701 */
2702 if (scan_inst->dst.file == GRF &&
2703 scan_inst->dst.reg >= first_write_grf &&
2704 scan_inst->dst.reg < first_write_grf + write_len &&
2705 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2706 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2707 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2708 }
2709
2710 /* Continue the loop only if we haven't resolved all the dependencies */
2711 int i;
2712 for (i = 0; i < write_len; i++) {
2713 if (needs_dep[i])
2714 break;
2715 }
2716 if (i == write_len)
2717 return;
2718 }
2719
2720 /* If we hit the end of the program, resolve all remaining dependencies out
2721 * of paranoia.
2722 */
2723 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2724 assert(last_inst->eot);
2725 for (int i = 0; i < write_len; i++) {
2726 if (needs_dep[i])
2727 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2728 }
2729 }
2730
2731 void
2732 fs_visitor::insert_gen4_send_dependency_workarounds()
2733 {
2734 if (brw->gen != 4 || brw->is_g4x)
2735 return;
2736
2737 /* Note that we're done with register allocation, so GRF fs_regs always
2738 * have a .reg_offset of 0.
2739 */
2740
2741 foreach_list_safe(node, &this->instructions) {
2742 fs_inst *inst = (fs_inst *)node;
2743
2744 if (inst->mlen != 0 && inst->dst.file == GRF) {
2745 insert_gen4_pre_send_dependency_workarounds(inst);
2746 insert_gen4_post_send_dependency_workarounds(inst);
2747 }
2748 }
2749 }
2750
2751 /**
2752 * Turns the generic expression-style uniform pull constant load instruction
2753 * into a hardware-specific series of instructions for loading a pull
2754 * constant.
2755 *
2756 * The expression style allows the CSE pass before this to optimize out
2757 * repeated loads from the same offset, and gives the pre-register-allocation
2758 * scheduling full flexibility, while the conversion to native instructions
2759 * allows the post-register-allocation scheduler the best information
2760 * possible.
2761 *
2762 * Note that execution masking for setting up pull constant loads is special:
2763 * the channels that need to be written are unrelated to the current execution
2764 * mask, since a later instruction will use one of the result channels as a
2765 * source operand for all 8 or 16 of its channels.
2766 */
2767 void
2768 fs_visitor::lower_uniform_pull_constant_loads()
2769 {
2770 foreach_list(node, &this->instructions) {
2771 fs_inst *inst = (fs_inst *)node;
2772
2773 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2774 continue;
2775
2776 if (brw->gen >= 7) {
2777 /* The offset arg before was a vec4-aligned byte offset. We need to
2778 * turn it into a dword offset.
2779 */
2780 fs_reg const_offset_reg = inst->src[1];
2781 assert(const_offset_reg.file == IMM &&
2782 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2783 const_offset_reg.imm.u /= 4;
2784 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2785
2786 /* This is actually going to be a MOV, but since only the first dword
2787 * is accessed, we have a special opcode to do just that one. Note
2788 * that this needs to be an operation that will be considered a def
2789 * by live variable analysis, or register allocation will explode.
2790 */
2791 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2792 payload, const_offset_reg);
2793 setup->force_writemask_all = true;
2794
2795 setup->ir = inst->ir;
2796 setup->annotation = inst->annotation;
2797 inst->insert_before(setup);
2798
2799 /* Similarly, this will only populate the first 4 channels of the
2800 * result register (since we only use smear values from 0-3), but we
2801 * don't tell the optimizer.
2802 */
2803 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2804 inst->src[1] = payload;
2805
2806 invalidate_live_intervals();
2807 } else {
2808 /* Before register allocation, we didn't tell the scheduler about the
2809 * MRF we use. We know it's safe to use this MRF because nothing
2810 * else does except for register spill/unspill, which generates and
2811 * uses its MRF within a single IR instruction.
2812 */
2813 inst->base_mrf = 14;
2814 inst->mlen = 1;
2815 }
2816 }
2817 }
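
/* Illustrative sketch (register numbers and offsets are made up): on gen7 a
 * load such as
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD vgrf7, <pull surface>, 48u
 *
 * (48 being a vec4-aligned byte offset) becomes, roughly,
 *
 *    FS_OPCODE_SET_SIMD4X2_OFFSET vgrf8, 12u        (48 bytes -> dword 12)
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 vgrf7, <pull surface>, vgrf8
 *
 * while on earlier generations the original opcode is kept and just has
 * base_mrf/mlen filled in so the generator can build the message from MRF 14.
 */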
2818
2819 void
2820 fs_visitor::dump_instruction(backend_instruction *be_inst)
2821 {
2822 fs_inst *inst = (fs_inst *)be_inst;
2823
2824 if (inst->predicate) {
2825 printf("(%cf0.%d) ",
2826 inst->predicate_inverse ? '-' : '+',
2827 inst->flag_subreg);
2828 }
2829
2830 printf("%s", brw_instruction_name(inst->opcode));
2831 if (inst->saturate)
2832 printf(".sat");
2833 if (inst->conditional_mod) {
2834 printf(".cmod");
2835 if (!inst->predicate &&
2836 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2837 inst->opcode != BRW_OPCODE_IF &&
2838 inst->opcode != BRW_OPCODE_WHILE))) {
2839 printf(".f0.%d", inst->flag_subreg);
2840 }
2841 }
2842 printf(" ");
2843
2844
2845 switch (inst->dst.file) {
2846 case GRF:
2847 printf("vgrf%d", inst->dst.reg);
2848 if (inst->dst.reg_offset)
2849 printf("+%d", inst->dst.reg_offset);
2850 break;
2851 case MRF:
2852 printf("m%d", inst->dst.reg);
2853 break;
2854 case BAD_FILE:
2855 printf("(null)");
2856 break;
2857 case UNIFORM:
2858 printf("***u%d***", inst->dst.reg);
2859 break;
2860 case HW_REG:
2861 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2862 if (inst->dst.fixed_hw_reg.subnr)
2863 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2864 break;
2865 default:
2866 printf("???");
2867 break;
2868 }
2869 printf(", ");
2870
2871 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
2872 if (inst->src[i].negate)
2873 printf("-");
2874 if (inst->src[i].abs)
2875 printf("|");
2876 switch (inst->src[i].file) {
2877 case GRF:
2878 printf("vgrf%d", inst->src[i].reg);
2879 if (inst->src[i].reg_offset)
2880 printf("+%d", inst->src[i].reg_offset);
2881 break;
2882 case MRF:
2883 printf("***m%d***", inst->src[i].reg);
2884 break;
2885 case UNIFORM:
2886 printf("u%d", inst->src[i].reg);
2887 if (inst->src[i].reg_offset)
2888 printf(".%d", inst->src[i].reg_offset);
2889 break;
2890 case BAD_FILE:
2891 printf("(null)");
2892 break;
2893 case IMM:
2894 switch (inst->src[i].type) {
2895 case BRW_REGISTER_TYPE_F:
2896 printf("%ff", inst->src[i].imm.f);
2897 break;
2898 case BRW_REGISTER_TYPE_D:
2899 printf("%dd", inst->src[i].imm.i);
2900 break;
2901 case BRW_REGISTER_TYPE_UD:
2902 printf("%uu", inst->src[i].imm.u);
2903 break;
2904 default:
2905 printf("???");
2906 break;
2907 }
2908 break;
2909 case HW_REG:
2910 if (inst->src[i].fixed_hw_reg.negate)
2911 printf("-");
2912 if (inst->src[i].fixed_hw_reg.abs)
2913 printf("|");
2914 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2915 if (inst->src[i].fixed_hw_reg.subnr)
2916 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2917 if (inst->src[i].fixed_hw_reg.abs)
2918 printf("|");
2919 break;
2920 default:
2921 printf("???");
2922 break;
2923 }
2924 if (inst->src[i].abs)
2925 printf("|");
2926
2927 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
2928 printf(", ");
2929 }
2930
2931 printf(" ");
2932
2933 if (inst->force_uncompressed)
2934 printf("1sthalf ");
2935
2936 if (inst->force_sechalf)
2937 printf("2ndhalf ");
2938
2939 printf("\n");
2940 }
2941
2942 /**
2943 * Possibly returns an instruction that set up @param reg.
2944 *
2945 * Sometimes we want to take the result of some expression/variable
2946 * dereference tree and rewrite the instruction generating the result
2947 * of the tree. When processing the tree, we know that the
2948 * instructions generated are all writing temporaries that are dead
2949 * outside of this tree. So, if we have some instructions that write
2950 * a temporary, we're free to point that temp write somewhere else.
2951 *
2952  * Note that this doesn't guarantee that the returned instruction wrote
2953  * only reg -- it might be the size=4 destination of a texture instruction.
2954 */
2955 fs_inst *
2956 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2957 fs_inst *end,
2958 fs_reg reg)
2959 {
2960 if (end == start ||
2961 end->is_partial_write() ||
2962 reg.reladdr ||
2963 !reg.equals(end->dst)) {
2964 return NULL;
2965 } else {
2966 return end;
2967 }
2968 }
2969
2970 void
2971 fs_visitor::setup_payload_gen6()
2972 {
2973 bool uses_depth =
2974 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2975 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2976
2977 assert(brw->gen >= 6);
2978
2979 /* R0-1: masks, pixel X/Y coordinates. */
2980 c->nr_payload_regs = 2;
2981    /* R2: only for 32-pixel dispatch. */
2982
2983 /* R3-26: barycentric interpolation coordinates. These appear in the
2984 * same order that they appear in the brw_wm_barycentric_interp_mode
2985 * enum. Each set of coordinates occupies 2 registers if dispatch width
2986 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2987 * appear if they were enabled using the "Barycentric Interpolation
2988 * Mode" bits in WM_STATE.
2989 */
2990 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2991 if (barycentric_interp_modes & (1 << i)) {
2992 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2993 c->nr_payload_regs += 2;
2994 if (dispatch_width == 16) {
2995 c->nr_payload_regs += 2;
2996 }
2997 }
2998 }
2999
3000 /* R27: interpolated depth if uses source depth */
3001 if (uses_depth) {
3002 c->source_depth_reg = c->nr_payload_regs;
3003 c->nr_payload_regs++;
3004 if (dispatch_width == 16) {
3005 /* R28: interpolated depth if not 8-wide. */
3006 c->nr_payload_regs++;
3007 }
3008 }
3009 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3010 if (uses_depth) {
3011 c->source_w_reg = c->nr_payload_regs;
3012 c->nr_payload_regs++;
3013 if (dispatch_width == 16) {
3014 /* R30: interpolated W if not 8-wide. */
3015 c->nr_payload_regs++;
3016 }
3017 }
3018
3019 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3020 /* R31: MSAA position offsets. */
3021 if (c->prog_data.uses_pos_offset) {
3022 c->sample_pos_reg = c->nr_payload_regs;
3023 c->nr_payload_regs++;
3024 }
3025
3026 /* R32-: bary for 32-pixel. */
3027 /* R58-59: interp W for 32-pixel. */
3028
3029 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3030 c->source_depth_to_render_target = true;
3031 }
3032 }
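
/* Illustrative example (assumed configuration, not a real shader): for an
 * 8-wide dispatch with a single barycentric interpolation mode enabled,
 * source depth in use and no position offsets, the payload set up above is
 *
 *    r0-r1   masks and pixel X/Y coordinates
 *    r2-r3   barycentric coordinates for the one enabled mode
 *    r4      interpolated source depth
 *    r5      interpolated source W
 *
 * for a total of nr_payload_regs == 6.
 */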
3033
3034 void
3035 fs_visitor::assign_binding_table_offsets()
3036 {
3037 uint32_t next_binding_table_offset = 0;
3038
3039 /* If there are no color regions, we still perform an FB write to a null
3040 * renderbuffer, which we place at surface index 0.
3041 */
3042 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3043 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3044
3045 assign_common_binding_table_offsets(next_binding_table_offset);
3046 }
3047
3048 bool
3049 fs_visitor::run()
3050 {
3051 sanity_param_count = fp->Base.Parameters->NumParameters;
3052 uint32_t orig_nr_params = c->prog_data.nr_params;
3053 bool allocated_without_spills;
3054
3055 assign_binding_table_offsets();
3056
3057 if (brw->gen >= 6)
3058 setup_payload_gen6();
3059 else
3060 setup_payload_gen4();
3061
3062 if (0) {
3063 emit_dummy_fs();
3064 } else {
3065 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3066 emit_shader_time_begin();
3067
3068 calculate_urb_setup();
3069 if (fp->Base.InputsRead > 0) {
3070 if (brw->gen < 6)
3071 emit_interpolation_setup_gen4();
3072 else
3073 emit_interpolation_setup_gen6();
3074 }
3075
3076 /* We handle discards by keeping track of the still-live pixels in f0.1.
3077 * Initialize it with the dispatched pixels.
3078 */
3079 if (fp->UsesKill || c->key.alpha_test_func) {
3080 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3081 discard_init->flag_subreg = 1;
3082 }
3083
3084 /* Generate FS IR for main(). (the visitor only descends into
3085 * functions called "main").
3086 */
3087 if (shader) {
3088 foreach_list(node, &*shader->ir) {
3089 ir_instruction *ir = (ir_instruction *)node;
3090 base_ir = ir;
3091 this->result = reg_undef;
3092 ir->accept(this);
3093 }
3094 } else {
3095 emit_fragment_program_code();
3096 }
3097 base_ir = NULL;
3098 if (failed)
3099 return false;
3100
3101 emit(FS_OPCODE_PLACEHOLDER_HALT);
3102
3103 if (c->key.alpha_test_func)
3104 emit_alpha_test();
3105
3106 emit_fb_writes();
3107
3108 split_virtual_grfs();
3109
3110 move_uniform_array_access_to_pull_constants();
3111 remove_dead_constants();
3112 setup_pull_constants();
3113
3114 bool progress;
3115 do {
3116 progress = false;
3117
3118 compact_virtual_grfs();
3119
3120 progress = remove_duplicate_mrf_writes() || progress;
3121
3122 progress = opt_algebraic() || progress;
3123 progress = opt_cse() || progress;
3124 progress = opt_copy_propagate() || progress;
3125 progress = dead_code_eliminate() || progress;
3126 progress = dead_code_eliminate_local() || progress;
3127 progress = dead_control_flow_eliminate(this) || progress;
3128 progress = register_coalesce() || progress;
3129 progress = compute_to_mrf() || progress;
3130 } while (progress);
3131
3132 lower_uniform_pull_constant_loads();
3133
3134 assign_curb_setup();
3135 assign_urb_setup();
3136
3137 static enum instruction_scheduler_mode pre_modes[] = {
3138 SCHEDULE_PRE,
3139 SCHEDULE_PRE_NON_LIFO,
3140 SCHEDULE_PRE_LIFO,
3141 };
3142
3143 /* Try each scheduling heuristic to see if it can successfully register
3144 * allocate without spilling. They should be ordered by decreasing
3145 * performance but increasing likelihood of allocating.
3146 */
3147 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3148 schedule_instructions(pre_modes[i]);
3149
3150 if (0) {
3151 assign_regs_trivial();
3152 allocated_without_spills = true;
3153 } else {
3154 allocated_without_spills = assign_regs(false);
3155 }
3156 if (allocated_without_spills)
3157 break;
3158 }
3159
3160 if (!allocated_without_spills) {
3161 /* We assume that any spilling is worse than just dropping back to
3162          * SIMD8. There's probably some intermediate point where
3163 * SIMD16 with a couple of spills is still better.
3164 */
3165 if (dispatch_width == 16) {
3166 fail("Failure to register allocate. Reduce number of "
3167 "live scalar values to avoid this.");
3168 }
3169
3170 /* Since we're out of heuristics, just go spill registers until we
3171 * get an allocation.
3172 */
3173 while (!assign_regs(true)) {
3174 if (failed)
3175 break;
3176 }
3177 }
3178 }
3179 assert(force_uncompressed_stack == 0);
3180
3181 /* This must come after all optimization and register allocation, since
3182 * it inserts dead code that happens to have side effects, and it does
3183 * so based on the actual physical registers in use.
3184 */
3185 insert_gen4_send_dependency_workarounds();
3186
3187 if (failed)
3188 return false;
3189
3190 if (!allocated_without_spills)
3191 schedule_instructions(SCHEDULE_POST);
3192
3193 if (dispatch_width == 8) {
3194 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3195 } else {
3196 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3197
3198 /* Make sure we didn't try to sneak in an extra uniform */
3199 assert(orig_nr_params == c->prog_data.nr_params);
3200 (void) orig_nr_params;
3201 }
3202
3203 /* If any state parameters were appended, then ParameterValues could have
3204 * been realloced, in which case the driver uniform storage set up by
3205 * _mesa_associate_uniform_storage() would point to freed memory. Make
3206 * sure that didn't happen.
3207 */
3208 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3209
3210 return !failed;
3211 }
3212
3213 const unsigned *
3214 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3215 struct gl_fragment_program *fp,
3216 struct gl_shader_program *prog,
3217 unsigned *final_assembly_size)
3218 {
3219 bool start_busy = false;
3220 float start_time = 0;
3221
3222 if (unlikely(brw->perf_debug)) {
3223 start_busy = (brw->batch.last_bo &&
3224 drm_intel_bo_busy(brw->batch.last_bo));
3225 start_time = get_time();
3226 }
3227
3228 struct brw_shader *shader = NULL;
3229 if (prog)
3230 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3231
3232 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3233 if (prog) {
3234 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3235 _mesa_print_ir(shader->ir, NULL);
3236 printf("\n\n");
3237 } else {
3238 printf("ARB_fragment_program %d ir for native fragment shader\n",
3239 fp->Base.Id);
3240 _mesa_print_program(&fp->Base);
3241 }
3242 }
3243
3244 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3245 */
3246 fs_visitor v(brw, c, prog, fp, 8);
3247 if (!v.run()) {
3248 if (prog) {
3249 prog->LinkStatus = false;
3250 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3251 }
3252
3253 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3254 v.fail_msg);
3255
3256 return NULL;
3257 }
3258
3259 exec_list *simd16_instructions = NULL;
3260 fs_visitor v2(brw, c, prog, fp, 16);
3261 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3262 if (c->prog_data.nr_pull_params == 0) {
3263 /* Try a 16-wide compile */
3264 v2.import_uniforms(&v);
3265 if (!v2.run()) {
3266 perf_debug("16-wide shader failed to compile, falling back to "
3267 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3268 } else {
3269 simd16_instructions = &v2.instructions;
3270 }
3271 } else {
3272 perf_debug("Skipping 16-wide due to pull parameters.\n");
3273 }
3274 }
3275
3276 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3277 const unsigned *generated = g.generate_assembly(&v.instructions,
3278 simd16_instructions,
3279 final_assembly_size);
3280
3281 if (unlikely(brw->perf_debug) && shader) {
3282 if (shader->compiled_once)
3283 brw_wm_debug_recompile(brw, prog, &c->key);
3284 shader->compiled_once = true;
3285
3286 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3287 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3288 (get_time() - start_time) * 1000);
3289 }
3290 }
3291
3292 return generated;
3293 }
3294
3295 bool
3296 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3297 {
3298 struct brw_context *brw = brw_context(ctx);
3299 struct brw_wm_prog_key key;
3300
3301 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3302 return true;
3303
3304 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3305 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3306 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3307 bool program_uses_dfdy = fp->UsesDFdy;
3308
3309 memset(&key, 0, sizeof(key));
3310
3311 if (brw->gen < 6) {
3312 if (fp->UsesKill)
3313 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3314
3315 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3316 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3317
3318 /* Just assume depth testing. */
3319 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3320 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3321 }
3322
3323 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3324 BRW_FS_VARYING_INPUT_MASK) > 16)
3325 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3326
3327 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3328
3329 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3330 for (unsigned i = 0; i < sampler_count; i++) {
3331 if (fp->Base.ShadowSamplers & (1 << i)) {
3332 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3333 key.tex.swizzles[i] =
3334 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3335 } else {
3336 /* Color sampler: assume no swizzling. */
3337 key.tex.swizzles[i] = SWIZZLE_XYZW;
3338 }
3339 }
3340
3341 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3342 key.drawable_height = ctx->DrawBuffer->Height;
3343 }
3344
3345 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3346 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3347 }
3348
3349 key.nr_color_regions = 1;
3350
3351 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3352 * quality of the derivatives is likely to be determined by the driconf
3353 * option.
3354 */
3355 key.high_quality_derivatives = brw->disable_derivative_optimization;
3356
3357 key.program_string_id = bfp->id;
3358
3359 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3360 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3361
3362 bool success = do_wm_prog(brw, prog, bfp, &key);
3363
3364 brw->wm.base.prog_offset = old_prog_offset;
3365 brw->wm.prog_data = old_prog_data;
3366
3367 return success;
3368 }