i965/fs: New peephole optimization to flatten IF/BREAK/ENDIF.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
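/* For reference, a use such as ALU2(ADD) above expands to:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */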
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258 * We break down the const_offset to a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
264 */
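   /* For example, with const_offset == 6 the load below is issued at
    * varying_offset + 4, and the final MOV picks component 2 (times the
    * message scale) out of the returned vec4.
    */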
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
303
304 /**
305 * A helper for MOV generation for fixing up broken hardware SEND dependency
306 * handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 this->smear = -1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 type == r.type &&
436 negate == r.negate &&
437 abs == r.abs &&
438 !reladdr && !r.reladdr &&
439 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
440 sizeof(fixed_hw_reg)) == 0 &&
441 smear == r.smear &&
442 imm.u == r.imm.u);
443 }
444
445 fs_reg
446 fs_reg::retype(uint32_t type)
447 {
448 fs_reg result = *this;
449 result.type = type;
450 return result;
451 }
452
453 bool
454 fs_reg::is_zero() const
455 {
456 if (file != IMM)
457 return false;
458
459 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
460 }
461
462 bool
463 fs_reg::is_one() const
464 {
465 if (file != IMM)
466 return false;
467
468 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
469 }
470
471 bool
472 fs_reg::is_null() const
473 {
474 return file == HW_REG &&
475 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
476 fixed_hw_reg.nr == BRW_ARF_NULL;
477 }
478
479 bool
480 fs_reg::is_valid_3src() const
481 {
482 return file == GRF || file == UNIFORM;
483 }
484
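/* For example: a float or int occupies 1 slot, a vec4 occupies 4, a mat4
 * occupies 16, and "float a[8]" occupies 8.  Samplers and atomic counters
 * occupy 0.
 */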
485 int
486 fs_visitor::type_size(const struct glsl_type *type)
487 {
488 unsigned int size, i;
489
490 switch (type->base_type) {
491 case GLSL_TYPE_UINT:
492 case GLSL_TYPE_INT:
493 case GLSL_TYPE_FLOAT:
494 case GLSL_TYPE_BOOL:
495 return type->components();
496 case GLSL_TYPE_ARRAY:
497 return type_size(type->fields.array) * type->length;
498 case GLSL_TYPE_STRUCT:
499 size = 0;
500 for (i = 0; i < type->length; i++) {
501 size += type_size(type->fields.structure[i].type);
502 }
503 return size;
504 case GLSL_TYPE_SAMPLER:
505 /* Samplers take up no register space, since they're baked in at
506 * link time.
507 */
508 return 0;
509 case GLSL_TYPE_ATOMIC_UINT:
510 return 0;
511 case GLSL_TYPE_VOID:
512 case GLSL_TYPE_ERROR:
513 case GLSL_TYPE_INTERFACE:
514 assert(!"not reached");
515 break;
516 }
517
518 return 0;
519 }
520
521 fs_reg
522 fs_visitor::get_timestamp()
523 {
524 assert(brw->gen >= 7);
525
526 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
527 BRW_ARF_TIMESTAMP,
528 0),
529 BRW_REGISTER_TYPE_UD));
530
531 fs_reg dst = fs_reg(this, glsl_type::uint_type);
532
533 fs_inst *mov = emit(MOV(dst, ts));
534 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
535 * even if it's not enabled in the dispatch.
536 */
537 mov->force_writemask_all = true;
538 mov->force_uncompressed = true;
539
540 /* The caller wants the low 32 bits of the timestamp. Since it's running
541 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
542 * which is plenty of time for our purposes. It is identical across the
543 * EUs, but since it's tracking GPU core speed it will increment at a
544 * varying rate as render P-states change.
545 *
546 * The caller could also check if render P-states have changed (or anything
547 * else that might disrupt timing) by setting smear to 2 and checking if
548 * that field is != 0.
549 */
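   /* (A 32-bit counter at ~1.2e9 cycles/second wraps after roughly
    * 2^32 / 1.2e9 ≈ 3.6 seconds, which is where the ~3 second estimate
    * above comes from.)
    */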
550 dst.smear = 0;
551
552 return dst;
553 }
554
555 void
556 fs_visitor::emit_shader_time_begin()
557 {
558 current_annotation = "shader time start";
559 shader_start_time = get_timestamp();
560 }
561
562 void
563 fs_visitor::emit_shader_time_end()
564 {
565 current_annotation = "shader time end";
566
567 enum shader_time_shader_type type, written_type, reset_type;
568 if (dispatch_width == 8) {
569 type = ST_FS8;
570 written_type = ST_FS8_WRITTEN;
571 reset_type = ST_FS8_RESET;
572 } else {
573 assert(dispatch_width == 16);
574 type = ST_FS16;
575 written_type = ST_FS16_WRITTEN;
576 reset_type = ST_FS16_RESET;
577 }
578
579 fs_reg shader_end_time = get_timestamp();
580
581 /* Check that there weren't any timestamp reset events (assuming these
582 * were the only two timestamp reads that happened).
583 */
584 fs_reg reset = shader_end_time;
585 reset.smear = 2;
586 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
587 test->conditional_mod = BRW_CONDITIONAL_Z;
588 emit(IF(BRW_PREDICATE_NORMAL));
589
590 push_force_uncompressed();
591 fs_reg start = shader_start_time;
592 start.negate = true;
593 fs_reg diff = fs_reg(this, glsl_type::uint_type);
594 emit(ADD(diff, start, shader_end_time));
595
596 /* If there were no instructions between the two timestamp gets, the diff
597 * is 2 cycles. Remove that overhead, so I can forget about that when
598 * trying to determine the time taken for single instructions.
599 */
600 emit(ADD(diff, diff, fs_reg(-2u)));
601
602 emit_shader_time_write(type, diff);
603 emit_shader_time_write(written_type, fs_reg(1u));
604 emit(BRW_OPCODE_ELSE);
605 emit_shader_time_write(reset_type, fs_reg(1u));
606 emit(BRW_OPCODE_ENDIF);
607
608 pop_force_uncompressed();
609 }
610
611 void
612 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
613 fs_reg value)
614 {
615 int shader_time_index =
616 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
617 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
618
619 fs_reg payload;
620 if (dispatch_width == 8)
621 payload = fs_reg(this, glsl_type::uvec2_type);
622 else
623 payload = fs_reg(this, glsl_type::uint_type);
624
625 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
626 fs_reg(), payload, offset, value));
627 }
628
629 void
630 fs_visitor::fail(const char *format, ...)
631 {
632 va_list va;
633 char *msg;
634
635 if (failed)
636 return;
637
638 failed = true;
639
640 va_start(va, format);
641 msg = ralloc_vasprintf(mem_ctx, format, va);
642 va_end(va);
643 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
644
645 this->fail_msg = msg;
646
647 if (INTEL_DEBUG & DEBUG_WM) {
648 fprintf(stderr, "%s", msg);
649 }
650 }
651
652 fs_inst *
653 fs_visitor::emit(enum opcode opcode)
654 {
655 return emit(fs_inst(opcode));
656 }
657
658 fs_inst *
659 fs_visitor::emit(enum opcode opcode, fs_reg dst)
660 {
661 return emit(fs_inst(opcode, dst));
662 }
663
664 fs_inst *
665 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
666 {
667 return emit(fs_inst(opcode, dst, src0));
668 }
669
670 fs_inst *
671 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
672 {
673 return emit(fs_inst(opcode, dst, src0, src1));
674 }
675
676 fs_inst *
677 fs_visitor::emit(enum opcode opcode, fs_reg dst,
678 fs_reg src0, fs_reg src1, fs_reg src2)
679 {
680 return emit(fs_inst(opcode, dst, src0, src1, src2));
681 }
682
683 void
684 fs_visitor::push_force_uncompressed()
685 {
686 force_uncompressed_stack++;
687 }
688
689 void
690 fs_visitor::pop_force_uncompressed()
691 {
692 force_uncompressed_stack--;
693 assert(force_uncompressed_stack >= 0);
694 }
695
696 /**
697 * Returns true if the instruction has a flag that means it won't
698 * update an entire destination register.
699 *
700 * For example, dead code elimination and live variable analysis want to know
701 * when a write to a variable screens off any preceding values that were in
702 * it.
703 */
704 bool
705 fs_inst::is_partial_write()
706 {
707 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
708 this->force_uncompressed ||
709 this->force_sechalf);
710 }
711
712 int
713 fs_inst::regs_read(fs_visitor *v, int arg)
714 {
715 if (is_tex() && arg == 0 && src[0].file == GRF) {
716 if (v->dispatch_width == 16)
717 return (mlen + 1) / 2;
718 else
719 return mlen;
720 }
721 return 1;
722 }
723
724 bool
725 fs_inst::reads_flag()
726 {
727 return predicate;
728 }
729
730 bool
731 fs_inst::writes_flag()
732 {
733 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
734 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
735 }
736
737 /**
738 * Returns how many MRFs an FS opcode will write over.
739 *
740 * Note that this is not the 0 or 1 implied writes in an actual gen
741 * instruction -- the FS opcodes often generate MOVs in addition.
742 */
743 int
744 fs_visitor::implied_mrf_writes(fs_inst *inst)
745 {
746 if (inst->mlen == 0)
747 return 0;
748
749 if (inst->base_mrf == -1)
750 return 0;
751
752 switch (inst->opcode) {
753 case SHADER_OPCODE_RCP:
754 case SHADER_OPCODE_RSQ:
755 case SHADER_OPCODE_SQRT:
756 case SHADER_OPCODE_EXP2:
757 case SHADER_OPCODE_LOG2:
758 case SHADER_OPCODE_SIN:
759 case SHADER_OPCODE_COS:
760 return 1 * dispatch_width / 8;
761 case SHADER_OPCODE_POW:
762 case SHADER_OPCODE_INT_QUOTIENT:
763 case SHADER_OPCODE_INT_REMAINDER:
764 return 2 * dispatch_width / 8;
765 case SHADER_OPCODE_TEX:
766 case FS_OPCODE_TXB:
767 case SHADER_OPCODE_TXD:
768 case SHADER_OPCODE_TXF:
769 case SHADER_OPCODE_TXF_MS:
770 case SHADER_OPCODE_TG4:
771 case SHADER_OPCODE_TG4_OFFSET:
772 case SHADER_OPCODE_TXL:
773 case SHADER_OPCODE_TXS:
774 case SHADER_OPCODE_LOD:
775 return 1;
776 case FS_OPCODE_FB_WRITE:
777 return 2;
778 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
779 case SHADER_OPCODE_GEN4_SCRATCH_READ:
780 return 1;
781 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
782 return inst->mlen;
783 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
784 return 2;
785 case SHADER_OPCODE_UNTYPED_ATOMIC:
786 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
787 return 0;
788 default:
789 assert(!"not reached");
790 return inst->mlen;
791 }
792 }
793
794 int
795 fs_visitor::virtual_grf_alloc(int size)
796 {
797 if (virtual_grf_array_size <= virtual_grf_count) {
798 if (virtual_grf_array_size == 0)
799 virtual_grf_array_size = 16;
800 else
801 virtual_grf_array_size *= 2;
802 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
803 virtual_grf_array_size);
804 }
805 virtual_grf_sizes[virtual_grf_count] = size;
806 return virtual_grf_count++;
807 }
808
809 /** Fixed HW reg constructor. */
810 fs_reg::fs_reg(enum register_file file, int reg)
811 {
812 init();
813 this->file = file;
814 this->reg = reg;
815 this->type = BRW_REGISTER_TYPE_F;
816 }
817
818 /** Fixed HW reg constructor. */
819 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
820 {
821 init();
822 this->file = file;
823 this->reg = reg;
824 this->type = type;
825 }
826
827 /** Automatic reg constructor. */
828 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
829 {
830 init();
831
832 this->file = GRF;
833 this->reg = v->virtual_grf_alloc(v->type_size(type));
834 this->reg_offset = 0;
835 this->type = brw_type_for_base_type(type);
836 }
837
838 fs_reg *
839 fs_visitor::variable_storage(ir_variable *var)
840 {
841 return (fs_reg *)hash_table_find(this->variable_ht, var);
842 }
843
844 void
845 import_uniforms_callback(const void *key,
846 void *data,
847 void *closure)
848 {
849 struct hash_table *dst_ht = (struct hash_table *)closure;
850 const fs_reg *reg = (const fs_reg *)data;
851
852 if (reg->file != UNIFORM)
853 return;
854
855 hash_table_insert(dst_ht, data, key);
856 }
857
858 /* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
859  * This brings in those uniform definitions.
860 */
861 void
862 fs_visitor::import_uniforms(fs_visitor *v)
863 {
864 hash_table_call_foreach(v->variable_ht,
865 import_uniforms_callback,
866 variable_ht);
867 this->params_remap = v->params_remap;
868 this->nr_params_remap = v->nr_params_remap;
869 }
870
871 /* Our support for uniforms is piggy-backed on the struct
872 * gl_fragment_program, because that's where the values actually
873 * get stored, rather than in some global gl_shader_program uniform
874 * store.
875 */
876 void
877 fs_visitor::setup_uniform_values(ir_variable *ir)
878 {
879 int namelen = strlen(ir->name);
880
881 /* The data for our (non-builtin) uniforms is stored in a series of
882 * gl_uniform_driver_storage structs for each subcomponent that
883 * glGetUniformLocation() could name. We know it's been set up in the same
884 * order we'd walk the type, so walk the list of storage and find anything
885 * with our name, or the prefix of a component that starts with our name.
886 */
887 unsigned params_before = c->prog_data.nr_params;
888 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
889 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
890
891 if (strncmp(ir->name, storage->name, namelen) != 0 ||
892 (storage->name[namelen] != 0 &&
893 storage->name[namelen] != '.' &&
894 storage->name[namelen] != '[')) {
895 continue;
896 }
897
898 unsigned slots = storage->type->component_slots();
899 if (storage->array_elements)
900 slots *= storage->array_elements;
901
902 for (unsigned i = 0; i < slots; i++) {
903 c->prog_data.param[c->prog_data.nr_params++] =
904 &storage->storage[i].f;
905 }
906 }
907
908 /* Make sure we actually initialized the right amount of stuff here. */
909 assert(params_before + ir->type->component_slots() ==
910 c->prog_data.nr_params);
911 (void)params_before;
912 }
913
914
915 /* Our support for builtin uniforms is even scarier than non-builtin.
916 * It sits on top of the PROG_STATE_VAR parameters that are
917 * automatically updated from GL context state.
918 */
919 void
920 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
921 {
922 const ir_state_slot *const slots = ir->state_slots;
923 assert(ir->state_slots != NULL);
924
925 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
926 /* This state reference has already been setup by ir_to_mesa, but we'll
927 * get the same index back here.
928 */
929 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
930 (gl_state_index *)slots[i].tokens);
931
932 /* Add each of the unique swizzles of the element as a parameter.
933 * This'll end up matching the expected layout of the
934 * array/matrix/structure we're trying to fill in.
935 */
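      /* For example, a state value swizzled .xxxx contributes a single
       * param here, while .xyzw contributes four.
       */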
936 int last_swiz = -1;
937 for (unsigned int j = 0; j < 4; j++) {
938 int swiz = GET_SWZ(slots[i].swizzle, j);
939 if (swiz == last_swiz)
940 break;
941 last_swiz = swiz;
942
943 c->prog_data.param[c->prog_data.nr_params++] =
944 &fp->Base.Parameters->ParameterValues[index][swiz].f;
945 }
946 }
947 }
948
949 fs_reg *
950 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
951 {
952 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
953 fs_reg wpos = *reg;
954 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
955
956 /* gl_FragCoord.x */
957 if (ir->pixel_center_integer) {
958 emit(MOV(wpos, this->pixel_x));
959 } else {
960 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
961 }
962 wpos.reg_offset++;
963
964 /* gl_FragCoord.y */
965 if (!flip && ir->pixel_center_integer) {
966 emit(MOV(wpos, this->pixel_y));
967 } else {
968 fs_reg pixel_y = this->pixel_y;
969 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
970
971 if (flip) {
972 pixel_y.negate = true;
973 offset += c->key.drawable_height - 1.0;
974 }
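      /* With pixel_y negated, the ADD below computes
       * (drawable_height - 1 + center offset) - pixel_y, i.e. the
       * window-system-flipped Y coordinate.
       */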
975
976 emit(ADD(wpos, pixel_y, fs_reg(offset)));
977 }
978 wpos.reg_offset++;
979
980 /* gl_FragCoord.z */
981 if (brw->gen >= 6) {
982 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
983 } else {
984 emit(FS_OPCODE_LINTERP, wpos,
985 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
986 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
987 interp_reg(VARYING_SLOT_POS, 2));
988 }
989 wpos.reg_offset++;
990
991 /* gl_FragCoord.w: Already set up in emit_interpolation */
992 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
993
994 return reg;
995 }
996
997 fs_inst *
998 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
999 glsl_interp_qualifier interpolation_mode,
1000 bool is_centroid)
1001 {
1002 brw_wm_barycentric_interp_mode barycoord_mode;
1003 if (brw->gen >= 6) {
1004 if (is_centroid) {
1005 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1006 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1007 else
1008 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1009 } else {
1010 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1011 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1012 else
1013 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1014 }
1015 } else {
1016 /* On Ironlake and below, there is only one interpolation mode.
1017 * Centroid interpolation doesn't mean anything on this hardware --
1018 * there is no multisampling.
1019 */
1020 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1021 }
1022 return emit(FS_OPCODE_LINTERP, attr,
1023 this->delta_x[barycoord_mode],
1024 this->delta_y[barycoord_mode], interp);
1025 }
1026
1027 fs_reg *
1028 fs_visitor::emit_general_interpolation(ir_variable *ir)
1029 {
1030 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1031 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1032 fs_reg attr = *reg;
1033
1034 unsigned int array_elements;
1035 const glsl_type *type;
1036
1037 if (ir->type->is_array()) {
1038 array_elements = ir->type->length;
1039 if (array_elements == 0) {
1040 fail("dereferenced array '%s' has length 0\n", ir->name);
1041 }
1042 type = ir->type->fields.array;
1043 } else {
1044 array_elements = 1;
1045 type = ir->type;
1046 }
1047
1048 glsl_interp_qualifier interpolation_mode =
1049 ir->determine_interpolation_mode(c->key.flat_shade);
1050
1051 int location = ir->location;
1052 for (unsigned int i = 0; i < array_elements; i++) {
1053 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1054 if (c->prog_data.urb_setup[location] == -1) {
1055 /* If there's no incoming setup data for this slot, don't
1056 * emit interpolation for it.
1057 */
1058 attr.reg_offset += type->vector_elements;
1059 location++;
1060 continue;
1061 }
1062
1063 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1064 /* Constant interpolation (flat shading) case. The SF has
1065 * handed us defined values in only the constant offset
1066 * field of the setup reg.
1067 */
1068 for (unsigned int k = 0; k < type->vector_elements; k++) {
1069 struct brw_reg interp = interp_reg(location, k);
1070 interp = suboffset(interp, 3);
1071 interp.type = reg->type;
1072 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1073 attr.reg_offset++;
1074 }
1075 } else {
1076 /* Smooth/noperspective interpolation case. */
1077 for (unsigned int k = 0; k < type->vector_elements; k++) {
1078 /* FINISHME: At some point we probably want to push
1079 * this farther by giving similar treatment to the
1080 * other potentially constant components of the
1081 * attribute, as well as making brw_vs_constval.c
1082 * handle varyings other than gl_TexCoord.
1083 */
1084 struct brw_reg interp = interp_reg(location, k);
1085 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1086 ir->centroid);
1087 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1088 /* Get the pixel/sample mask into f0 so that we know
1089 * which pixels are lit. Then, for each channel that is
1090 * unlit, replace the centroid data with non-centroid
1091 * data.
1092 */
1093 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1094 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1095 interpolation_mode, false);
1096 inst->predicate = BRW_PREDICATE_NORMAL;
1097 inst->predicate_inverse = true;
1098 }
1099 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1100 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1101 }
1102 attr.reg_offset++;
1103 }
1104
1105 }
1106 location++;
1107 }
1108 }
1109
1110 return reg;
1111 }
1112
1113 fs_reg *
1114 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1115 {
1116 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1117
1118 /* The frontfacing comes in as a bit in the thread payload. */
1119 if (brw->gen >= 6) {
1120 emit(BRW_OPCODE_ASR, *reg,
1121 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1122 fs_reg(15));
1123 emit(BRW_OPCODE_NOT, *reg, *reg);
1124 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1125 } else {
1126 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1127 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1128        * us the front face.
1129 */
1130 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1131 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1132 }
1133
1134 return reg;
1135 }
1136
1137 void
1138 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1139 {
1140 assert(dst.type == BRW_REGISTER_TYPE_F);
1141
1142 if (c->key.compute_pos_offset) {
1143 /* Convert int_sample_pos to floating point */
1144 emit(MOV(dst, int_sample_pos));
1145 /* Scale to the range [0, 1] */
1146 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1147 }
1148 else {
1149 /* From ARB_sample_shading specification:
1150 * "When rendering to a non-multisample buffer, or if multisample
1151 * rasterization is disabled, gl_SamplePosition will always be
1152        *  (0.5, 0.5)."
1153 */
1154 emit(MOV(dst, fs_reg(0.5f)));
1155 }
1156 }
1157
1158 fs_reg *
1159 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1160 {
1161 assert(brw->gen >= 6);
1162 assert(ir->type == glsl_type::vec2_type);
1163
1164 this->current_annotation = "compute sample position";
1165 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1166 fs_reg pos = *reg;
1167 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1168 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1169
1170 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1171 * mode will be enabled.
1172 *
1173 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1174 * R31.1:0 Position Offset X/Y for Slot[3:0]
1175 * R31.3:2 Position Offset X/Y for Slot[7:4]
1176 * .....
1177 *
1178 * The X, Y sample positions come in as bytes in thread payload. So, read
1179 * the positions using vstride=16, width=8, hstride=2.
1180 */
1181 struct brw_reg sample_pos_reg =
1182 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1183 BRW_REGISTER_TYPE_B), 16, 8, 2);
1184
1185 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1186 if (dispatch_width == 16) {
1187 int_sample_x.sechalf = true;
1188 fs_inst *inst = emit(MOV(int_sample_x,
1189 fs_reg(suboffset(sample_pos_reg, 16))));
1190 inst->force_sechalf = true;
1191 int_sample_x.sechalf = false;
1192 }
1193 /* Compute gl_SamplePosition.x */
1194 compute_sample_position(pos, int_sample_x);
1195 pos.reg_offset++;
1196 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1197 if (dispatch_width == 16) {
1198 int_sample_y.sechalf = true;
1199 fs_inst *inst = emit(MOV(int_sample_y,
1200 fs_reg(suboffset(sample_pos_reg, 17))));
1201 inst->force_sechalf = true;
1202 int_sample_y.sechalf = false;
1203 }
1204 /* Compute gl_SamplePosition.y */
1205 compute_sample_position(pos, int_sample_y);
1206 return reg;
1207 }
1208
1209 fs_reg *
1210 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1211 {
1212 assert(brw->gen >= 6);
1213
1214 this->current_annotation = "compute sample id";
1215 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1216
1217 if (c->key.compute_sample_id) {
1218 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1219 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1220 t2.type = BRW_REGISTER_TYPE_UW;
1221
1222 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1223 * 8x multisampling, subspan 0 will represent sample N (where N
1224 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1225 * 7. We can find the value of N by looking at R0.0 bits 7:6
1226 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1227 * (since samples are always delivered in pairs). That is, we
1228 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1229 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1230 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1231 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1232 * populating a temporary variable with the sequence (0, 1, 2, 3),
1233 * and then reading from it using vstride=1, width=4, hstride=0.
1234 * These computations hold good for 4x multisampling as well.
1235 */
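      /* For example, with SSPI == 2 in R0.0 bits 7:6, t1 below becomes
       * (0x80 >> 5) == 4, and adding the per-subspan sequence
       * (0, 0, 0, 0, 1, 1, 1, 1) yields sample IDs (4, 4, 4, 4, 5, 5, 5, 5)
       * for a SIMD8 dispatch.
       */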
1236 emit(BRW_OPCODE_AND, t1,
1237 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1238 fs_reg(brw_imm_d(0xc0)));
1239 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1240 /* This works for both SIMD8 and SIMD16 */
1241 emit(MOV(t2, brw_imm_v(0x3210)));
1242 /* This special instruction takes care of setting vstride=1,
1243 * width=4, hstride=0 of t2 during an ADD instruction.
1244 */
1245 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1246 } else {
1247 /* As per GL_ARB_sample_shading specification:
1248 * "When rendering to a non-multisample buffer, or if multisample
1249 * rasterization is disabled, gl_SampleID will always be zero."
1250 */
1251 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1252 }
1253
1254 return reg;
1255 }
1256
1257 fs_reg
1258 fs_visitor::fix_math_operand(fs_reg src)
1259 {
1260 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1261 * might be able to do better by doing execsize = 1 math and then
1262 * expanding that result out, but we would need to be careful with
1263 * masking.
1264 *
1265 * The hardware ignores source modifiers (negate and abs) on math
1266 * instructions, so we also move to a temp to set those up.
1267 */
1268 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1269 !src.abs && !src.negate)
1270 return src;
1271
1272 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1273 * operands to math
1274 */
1275 if (brw->gen >= 7 && src.file != IMM)
1276 return src;
1277
1278 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1279 expanded.type = src.type;
1280 emit(BRW_OPCODE_MOV, expanded, src);
1281 return expanded;
1282 }
1283
1284 fs_inst *
1285 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1286 {
1287 switch (opcode) {
1288 case SHADER_OPCODE_RCP:
1289 case SHADER_OPCODE_RSQ:
1290 case SHADER_OPCODE_SQRT:
1291 case SHADER_OPCODE_EXP2:
1292 case SHADER_OPCODE_LOG2:
1293 case SHADER_OPCODE_SIN:
1294 case SHADER_OPCODE_COS:
1295 break;
1296 default:
1297 assert(!"not reached: bad math opcode");
1298 return NULL;
1299 }
1300
1301 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1302 * might be able to do better by doing execsize = 1 math and then
1303 * expanding that result out, but we would need to be careful with
1304 * masking.
1305 *
1306 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1307 * instructions, so we also move to a temp to set those up.
1308 */
1309 if (brw->gen >= 6)
1310 src = fix_math_operand(src);
1311
1312 fs_inst *inst = emit(opcode, dst, src);
1313
1314 if (brw->gen < 6) {
1315 inst->base_mrf = 2;
1316 inst->mlen = dispatch_width / 8;
1317 }
1318
1319 return inst;
1320 }
1321
1322 fs_inst *
1323 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1324 {
1325 int base_mrf = 2;
1326 fs_inst *inst;
1327
1328 switch (opcode) {
1329 case SHADER_OPCODE_INT_QUOTIENT:
1330 case SHADER_OPCODE_INT_REMAINDER:
1331 if (brw->gen >= 7 && dispatch_width == 16)
1332 fail("16-wide INTDIV unsupported\n");
1333 break;
1334 case SHADER_OPCODE_POW:
1335 break;
1336 default:
1337 assert(!"not reached: unsupported binary math opcode.");
1338 return NULL;
1339 }
1340
1341 if (brw->gen >= 6) {
1342 src0 = fix_math_operand(src0);
1343 src1 = fix_math_operand(src1);
1344
1345 inst = emit(opcode, dst, src0, src1);
1346 } else {
1347 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1348 * "Message Payload":
1349 *
1350 * "Operand0[7]. For the INT DIV functions, this operand is the
1351 * denominator."
1352 * ...
1353 * "Operand1[7]. For the INT DIV functions, this operand is the
1354 * numerator."
1355 */
1356 bool is_int_div = opcode != SHADER_OPCODE_POW;
1357 fs_reg &op0 = is_int_div ? src1 : src0;
1358 fs_reg &op1 = is_int_div ? src0 : src1;
1359
1360 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1361 inst = emit(opcode, dst, op0, reg_null_f);
1362
1363 inst->base_mrf = base_mrf;
1364 inst->mlen = 2 * dispatch_width / 8;
1365 }
1366 return inst;
1367 }
1368
1369 void
1370 fs_visitor::assign_curb_setup()
1371 {
1372 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1373 if (dispatch_width == 8) {
1374 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1375 } else {
1376 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1377 }
1378
1379 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1380 foreach_list(node, &this->instructions) {
1381 fs_inst *inst = (fs_inst *)node;
1382
1383 for (unsigned int i = 0; i < 3; i++) {
1384 if (inst->src[i].file == UNIFORM) {
1385 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1386 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1387 constant_nr / 8,
1388 constant_nr % 8);
1389
1390 inst->src[i].file = HW_REG;
1391 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1392 }
1393 }
1394 }
1395 }
1396
1397 void
1398 fs_visitor::calculate_urb_setup()
1399 {
1400 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1401 c->prog_data.urb_setup[i] = -1;
1402 }
1403
1404 int urb_next = 0;
1405 /* Figure out where each of the incoming setup attributes lands. */
1406 if (brw->gen >= 6) {
1407 if (_mesa_bitcount_64(fp->Base.InputsRead &
1408 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1409 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1410 * first 16 varying inputs, so we can put them wherever we want.
1411 * Just put them in order.
1412 *
1413 * This is useful because it means that (a) inputs not used by the
1414 * fragment shader won't take up valuable register space, and (b) we
1415 * won't have to recompile the fragment shader if it gets paired with
1416 * a different vertex (or geometry) shader.
1417 */
1418 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1419 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1420 BITFIELD64_BIT(i)) {
1421 c->prog_data.urb_setup[i] = urb_next++;
1422 }
1423 }
1424 } else {
1425 /* We have enough input varyings that the SF/SBE pipeline stage can't
1426 * arbitrarily rearrange them to suit our whim; we have to put them
1427 * in an order that matches the output of the previous pipeline stage
1428 * (geometry or vertex shader).
1429 */
1430 struct brw_vue_map prev_stage_vue_map;
1431 brw_compute_vue_map(brw, &prev_stage_vue_map,
1432 c->key.input_slots_valid);
1433 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1434 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1435 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1436 slot++) {
1437 int varying = prev_stage_vue_map.slot_to_varying[slot];
1438 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1439 * unused.
1440 */
1441 if (varying != BRW_VARYING_SLOT_COUNT &&
1442 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1443 BITFIELD64_BIT(varying))) {
1444 c->prog_data.urb_setup[varying] = slot - first_slot;
1445 }
1446 }
1447 urb_next = prev_stage_vue_map.num_slots - first_slot;
1448 }
1449 } else {
1450 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1451 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1452 /* Point size is packed into the header, not as a general attribute */
1453 if (i == VARYING_SLOT_PSIZ)
1454 continue;
1455
1456 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1457 /* The back color slot is skipped when the front color is
1458 * also written to. In addition, some slots can be
1459 * written in the vertex shader and not read in the
1460 * fragment shader. So the register number must always be
1461 * incremented, mapped or not.
1462 */
1463 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1464 c->prog_data.urb_setup[i] = urb_next;
1465 urb_next++;
1466 }
1467 }
1468
1469 /*
1470     * It's an FS-only attribute (gl_PointCoord), and the SF thread did the
1471     * interpolation for it, so count it here, too.
1472 *
1473 * See compile_sf_prog() for more info.
1474 */
1475 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1476 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1477 }
1478
1479 c->prog_data.num_varying_inputs = urb_next;
1480 }
1481
1482 void
1483 fs_visitor::assign_urb_setup()
1484 {
1485 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1486
1487 /* Offset all the urb_setup[] index by the actual position of the
1488 * setup regs, now that the location of the constants has been chosen.
1489 */
1490 foreach_list(node, &this->instructions) {
1491 fs_inst *inst = (fs_inst *)node;
1492
1493 if (inst->opcode == FS_OPCODE_LINTERP) {
1494 assert(inst->src[2].file == HW_REG);
1495 inst->src[2].fixed_hw_reg.nr += urb_start;
1496 }
1497
1498 if (inst->opcode == FS_OPCODE_CINTERP) {
1499 assert(inst->src[0].file == HW_REG);
1500 inst->src[0].fixed_hw_reg.nr += urb_start;
1501 }
1502 }
1503
1504 /* Each attribute is 4 setup channels, each of which is half a reg. */
1505 this->first_non_payload_grf =
1506 urb_start + c->prog_data.num_varying_inputs * 2;
1507 }
1508
1509 /**
1510 * Split large virtual GRFs into separate components if we can.
1511 *
1512 * This is mostly duplicated with what brw_fs_vector_splitting does,
1513 * but that's really conservative because it's afraid of doing
1514 * splitting that doesn't result in real progress after the rest of
1515 * the optimization phases, which would cause infinite looping in
1516 * optimization. We can do it once here, safely. This also has the
1517 * opportunity to split interpolated values, or maybe even uniforms,
1518 * which we don't have at the IR level.
1519 *
1520 * We want to split, because virtual GRFs are what we register
1521 * allocate and spill (due to contiguousness requirements for some
1522 * instructions), and they're what we naturally generate in the
1523 * codegen process, but most virtual GRFs don't actually need to be
1524 * contiguous sets of GRFs. If we split, we'll end up with reduced
1525 * live intervals and better dead code elimination and coalescing.
1526 */
1527 void
1528 fs_visitor::split_virtual_grfs()
1529 {
1530 int num_vars = this->virtual_grf_count;
1531 bool split_grf[num_vars];
1532 int new_virtual_grf[num_vars];
1533
1534 /* Try to split anything > 0 sized. */
1535 for (int i = 0; i < num_vars; i++) {
1536 if (this->virtual_grf_sizes[i] != 1)
1537 split_grf[i] = true;
1538 else
1539 split_grf[i] = false;
1540 }
1541
1542 if (brw->has_pln &&
1543 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1544 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1545 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1546 * Gen6, that was the only supported interpolation mode, and since Gen6,
1547 * delta_x and delta_y are in fixed hardware registers.
1548 */
1549 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1550 false;
1551 }
1552
1553 foreach_list(node, &this->instructions) {
1554 fs_inst *inst = (fs_inst *)node;
1555
1556 /* If there's a SEND message that requires contiguous destination
1557 * registers, no splitting is allowed.
1558 */
1559 if (inst->regs_written > 1) {
1560 split_grf[inst->dst.reg] = false;
1561 }
1562
1563 /* If we're sending from a GRF, don't split it, on the assumption that
1564 * the send is reading the whole thing.
1565 */
1566 if (inst->is_send_from_grf()) {
1567 for (int i = 0; i < 3; i++) {
1568 if (inst->src[i].file == GRF) {
1569 split_grf[inst->src[i].reg] = false;
1570 }
1571 }
1572 }
1573 }
1574
1575 /* Allocate new space for split regs. Note that the virtual
1576 * numbers will be contiguous.
1577 */
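   /* For example, a 4-register virtual GRF keeps reg_offset 0 under its
    * original number and gets three fresh single-register GRFs here; the
    * patching loop below then rewrites reg_offsets 1 through 3 to point at
    * them.
    */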
1578 for (int i = 0; i < num_vars; i++) {
1579 if (split_grf[i]) {
1580 new_virtual_grf[i] = virtual_grf_alloc(1);
1581 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1582 int reg = virtual_grf_alloc(1);
1583 assert(reg == new_virtual_grf[i] + j - 1);
1584 (void) reg;
1585 }
1586 this->virtual_grf_sizes[i] = 1;
1587 }
1588 }
1589
1590 foreach_list(node, &this->instructions) {
1591 fs_inst *inst = (fs_inst *)node;
1592
1593 if (inst->dst.file == GRF &&
1594 split_grf[inst->dst.reg] &&
1595 inst->dst.reg_offset != 0) {
1596 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1597 inst->dst.reg_offset - 1);
1598 inst->dst.reg_offset = 0;
1599 }
1600 for (int i = 0; i < 3; i++) {
1601 if (inst->src[i].file == GRF &&
1602 split_grf[inst->src[i].reg] &&
1603 inst->src[i].reg_offset != 0) {
1604 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1605 inst->src[i].reg_offset - 1);
1606 inst->src[i].reg_offset = 0;
1607 }
1608 }
1609 }
1610 invalidate_live_intervals();
1611 }
1612
1613 /**
1614 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1615 *
1616 * During code generation, we create tons of temporary variables, many of
1617 * which get immediately killed and are never used again. Yet, in later
1618 * optimization and analysis passes, such as compute_live_intervals, we need
1619 * to loop over all the virtual GRFs. Compacting them can save a lot of
1620 * overhead.
1621 */
1622 void
1623 fs_visitor::compact_virtual_grfs()
1624 {
1625 /* Mark which virtual GRFs are used, and count how many. */
1626 int remap_table[this->virtual_grf_count];
1627 memset(remap_table, -1, sizeof(remap_table));
1628
1629 foreach_list(node, &this->instructions) {
1630 const fs_inst *inst = (const fs_inst *) node;
1631
1632 if (inst->dst.file == GRF)
1633 remap_table[inst->dst.reg] = 0;
1634
1635 for (int i = 0; i < 3; i++) {
1636 if (inst->src[i].file == GRF)
1637 remap_table[inst->src[i].reg] = 0;
1638 }
1639 }
1640
1641 /* In addition to registers used in instructions, fs_visitor keeps
1642 * direct references to certain special values which must be patched:
1643 */
1644 fs_reg *special[] = {
1645 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1646 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1647 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1648 &delta_x[0], &delta_x[1], &delta_x[2],
1649 &delta_x[3], &delta_x[4], &delta_x[5],
1650 &delta_y[0], &delta_y[1], &delta_y[2],
1651 &delta_y[3], &delta_y[4], &delta_y[5],
1652 };
1653 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1654 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1655
1656 /* Treat all special values as used, to be conservative */
1657 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1658 if (special[i]->file == GRF)
1659 remap_table[special[i]->reg] = 0;
1660 }
1661
1662 /* Compact the GRF arrays. */
1663 int new_index = 0;
1664 for (int i = 0; i < this->virtual_grf_count; i++) {
1665 if (remap_table[i] != -1) {
1666 remap_table[i] = new_index;
1667 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1668 invalidate_live_intervals();
1669 ++new_index;
1670 }
1671 }
1672
1673 this->virtual_grf_count = new_index;
1674
1675 /* Patch all the instructions to use the newly renumbered registers */
1676 foreach_list(node, &this->instructions) {
1677 fs_inst *inst = (fs_inst *) node;
1678
1679 if (inst->dst.file == GRF)
1680 inst->dst.reg = remap_table[inst->dst.reg];
1681
1682 for (int i = 0; i < 3; i++) {
1683 if (inst->src[i].file == GRF)
1684 inst->src[i].reg = remap_table[inst->src[i].reg];
1685 }
1686 }
1687
1688 /* Patch all the references to special values */
1689 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1690 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1691 special[i]->reg = remap_table[special[i]->reg];
1692 }
1693 }
1694
1695 bool
1696 fs_visitor::remove_dead_constants()
1697 {
1698 if (dispatch_width == 8) {
1699 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1700 this->nr_params_remap = c->prog_data.nr_params;
1701
1702 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1703 this->params_remap[i] = -1;
1704
1705 /* Find which params are still in use. */
1706 foreach_list(node, &this->instructions) {
1707 fs_inst *inst = (fs_inst *)node;
1708
1709 for (int i = 0; i < 3; i++) {
1710 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1711
1712 if (inst->src[i].file != UNIFORM)
1713 continue;
1714
1715 /* Section 5.11 of the OpenGL 4.3 spec says:
1716 *
1717 * "Out-of-bounds reads return undefined values, which include
1718 * values from other variables of the active program or zero."
1719 */
1720 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1721 constant_nr = 0;
1722 }
1723
1724 /* For now, set this to non-negative. We'll give it the
1725 * actual new number in a moment, in order to keep the
1726 * register numbers nicely ordered.
1727 */
1728 this->params_remap[constant_nr] = 0;
1729 }
1730 }
1731
1732 /* Figure out what the new numbers for the params will be. At some
1733 * point when we're doing uniform array access, we're going to want
1734 * to keep the distinction between .reg and .reg_offset, but for
1735 * now we don't care.
1736 */
1737 unsigned int new_nr_params = 0;
1738 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1739 if (this->params_remap[i] != -1) {
1740 this->params_remap[i] = new_nr_params++;
1741 }
1742 }
1743
1744 /* Update the list of params to be uploaded to match our new numbering. */
1745 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1746 int remapped = this->params_remap[i];
1747
1748 if (remapped == -1)
1749 continue;
1750
1751 c->prog_data.param[remapped] = c->prog_data.param[i];
1752 }
1753
1754 c->prog_data.nr_params = new_nr_params;
1755 } else {
1756 /* This should have been generated in the 8-wide pass already. */
1757 assert(this->params_remap);
1758 }
1759
1760 /* Now do the renumbering of the shader to remove unused params. */
1761 foreach_list(node, &this->instructions) {
1762 fs_inst *inst = (fs_inst *)node;
1763
1764 for (int i = 0; i < 3; i++) {
1765 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1766
1767 if (inst->src[i].file != UNIFORM)
1768 continue;
1769
1770 /* as above alias to 0 */
1771 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1772 constant_nr = 0;
1773 }
1774 assert(this->params_remap[constant_nr] != -1);
1775 inst->src[i].reg = this->params_remap[constant_nr];
1776 inst->src[i].reg_offset = 0;
1777 }
1778 }
1779
1780 return true;
1781 }
1782
1783 /*
1784 * Implements array access of uniforms by inserting a
1785 * PULL_CONSTANT_LOAD instruction.
1786 *
1787 * Unlike temporary GRF array access (where we don't support it due to
1788 * the difficulty of doing relative addressing on instruction
1789 * destinations), we could potentially do array access of uniforms
1790 * that were loaded in GRF space as push constants. In real-world
1791 * usage we've seen, though, the arrays being used are always larger
1792 * than we could load as push constants, so just always move all
1793 * uniform array access out to a pull constant buffer.
1794 */
1795 void
1796 fs_visitor::move_uniform_array_access_to_pull_constants()
1797 {
1798 int pull_constant_loc[c->prog_data.nr_params];
1799
1800 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1801 pull_constant_loc[i] = -1;
1802 }
1803
1804 /* Walk through and find array access of uniforms. Put a copy of that
1805 * uniform in the pull constant buffer.
1806 *
1807 * Note that we don't move constant-indexed accesses to arrays. No
1808 * testing has been done of the performance impact of this choice.
1809 */
1810 foreach_list_safe(node, &this->instructions) {
1811 fs_inst *inst = (fs_inst *)node;
1812
1813 for (int i = 0 ; i < 3; i++) {
1814 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1815 continue;
1816
1817 int uniform = inst->src[i].reg;
1818
1819 /* If this array isn't already present in the pull constant buffer,
1820 * add it.
1821 */
1822 if (pull_constant_loc[uniform] == -1) {
1823 const float **values = &c->prog_data.param[uniform];
1824
1825 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1826
1827 assert(param_size[uniform]);
1828
1829 for (int j = 0; j < param_size[uniform]; j++) {
1830 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1831 values[j];
1832 }
1833 }
1834
1835 /* Set up the annotation tracking for new generated instructions. */
1836 base_ir = inst->ir;
1837 current_annotation = inst->annotation;
1838
1839 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1840 fs_reg temp = fs_reg(this, glsl_type::float_type);
1841 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1842 surf_index,
1843 *inst->src[i].reladdr,
1844 pull_constant_loc[uniform] +
1845 inst->src[i].reg_offset);
1846 inst->insert_before(&list);
1847
1848 inst->src[i].file = temp.file;
1849 inst->src[i].reg = temp.reg;
1850 inst->src[i].reg_offset = temp.reg_offset;
1851 inst->src[i].reladdr = NULL;
1852 }
1853 }
1854 }
1855
1856 /**
1857 * Choose accesses from the UNIFORM file to demote to using the pull
1858 * constant buffer.
1859 *
1860 * We allow a fragment shader to have more than the specified minimum
1861 * maximum number of fragment shader uniform components (64). If
1862 * there are too many of these, they'd fill up all of register space.
1863 * So, this will push some of them out to the pull constant buffer and
1864 * update the program to load them.
1865 */
1866 void
1867 fs_visitor::setup_pull_constants()
1868 {
1869 /* Only allow 16 registers (128 uniform components) as push constants. */
1870 unsigned int max_uniform_components = 16 * 8;
1871 if (c->prog_data.nr_params <= max_uniform_components)
1872 return;
1873
1874 if (dispatch_width == 16) {
1875 fail("Pull constants not supported in 16-wide\n");
1876 return;
1877 }
1878
1879 /* Just demote the end of the list. We could probably do better
1880 * here, demoting things that are rarely used in the program first.
1881 */
1882 unsigned int pull_uniform_base = max_uniform_components;
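   /* For example, a shader with 200 uniform components keeps the first 128
    * as push constants; the loop below demotes components 128..199 to the
    * pull constant buffer.
    */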
1883
1884 int pull_constant_loc[c->prog_data.nr_params];
1885 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1886 if (i < pull_uniform_base) {
1887 pull_constant_loc[i] = -1;
1888 } else {
1889 pull_constant_loc[i] = -1;
1890 /* If our constant is already being uploaded for reladdr purposes,
1891 * reuse it.
1892 */
1893 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1894 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1895 pull_constant_loc[i] = j;
1896 break;
1897 }
1898 }
1899 if (pull_constant_loc[i] == -1) {
1900 int pull_index = c->prog_data.nr_pull_params++;
1901 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1902 pull_constant_loc[i] = pull_index;
1903 }
1904 }
1905 }
1906 c->prog_data.nr_params = pull_uniform_base;
1907
1908 foreach_list(node, &this->instructions) {
1909 fs_inst *inst = (fs_inst *)node;
1910
1911 for (int i = 0; i < 3; i++) {
1912 if (inst->src[i].file != UNIFORM)
1913 continue;
1914
1915 int pull_index = pull_constant_loc[inst->src[i].reg +
1916 inst->src[i].reg_offset];
1917 if (pull_index == -1)
1918 continue;
1919
1920 assert(!inst->src[i].reladdr);
1921
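/* Load the aligned vec4 containing this constant; the smear value set
 * below selects the desired component within it.
 */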
1922 fs_reg dst = fs_reg(this, glsl_type::float_type);
1923 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1924 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1925 fs_inst *pull =
1926 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1927 dst, index, offset);
1928 pull->ir = inst->ir;
1929 pull->annotation = inst->annotation;
1930
1931 inst->insert_before(pull);
1932
1933 inst->src[i].file = GRF;
1934 inst->src[i].reg = dst.reg;
1935 inst->src[i].reg_offset = 0;
1936 inst->src[i].smear = pull_index & 3;
1937 }
1938 }
1939 }
1940
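/**
 * Performs simple algebraic simplifications: multiplies by 1.0, adds of 0.0,
 * and ORs of a register with itself become plain MOVs, multiplies by 0.0
 * become MOVs of 0.0, and saturated SELs against an immediate that the
 * saturate already guarantees are reduced to MOVs.
 */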
1941 bool
1942 fs_visitor::opt_algebraic()
1943 {
1944 bool progress = false;
1945
1946 foreach_list(node, &this->instructions) {
1947 fs_inst *inst = (fs_inst *)node;
1948
1949 switch (inst->opcode) {
1950 case BRW_OPCODE_MUL:
1951 if (inst->src[1].file != IMM)
1952 continue;
1953
1954 /* a * 1.0 = a */
1955 if (inst->src[1].is_one()) {
1956 inst->opcode = BRW_OPCODE_MOV;
1957 inst->src[1] = reg_undef;
1958 progress = true;
1959 break;
1960 }
1961
1962 /* a * 0.0 = 0.0 */
1963 if (inst->src[1].is_zero()) {
1964 inst->opcode = BRW_OPCODE_MOV;
1965 inst->src[0] = inst->src[1];
1966 inst->src[1] = reg_undef;
1967 progress = true;
1968 break;
1969 }
1970
1971 break;
1972 case BRW_OPCODE_ADD:
1973 if (inst->src[1].file != IMM)
1974 continue;
1975
1976 /* a + 0.0 = a */
1977 if (inst->src[1].is_zero()) {
1978 inst->opcode = BRW_OPCODE_MOV;
1979 inst->src[1] = reg_undef;
1980 progress = true;
1981 break;
1982 }
1983 break;
1984 case BRW_OPCODE_OR:
1985 if (inst->src[0].equals(inst->src[1])) {
1986 inst->opcode = BRW_OPCODE_MOV;
1987 inst->src[1] = reg_undef;
1988 progress = true;
1989 break;
1990 }
1991 break;
1992 case BRW_OPCODE_SEL:
1993 if (inst->saturate && inst->src[1].file == IMM) {
1994 switch (inst->conditional_mod) {
1995 case BRW_CONDITIONAL_LE:
1996 case BRW_CONDITIONAL_L:
1997 switch (inst->src[1].type) {
1998 case BRW_REGISTER_TYPE_F:
1999 if (inst->src[1].imm.f >= 1.0f) {
2000 inst->opcode = BRW_OPCODE_MOV;
2001 inst->src[1] = reg_undef;
2002 progress = true;
2003 }
2004 break;
2005 default:
2006 break;
2007 }
2008 break;
2009 case BRW_CONDITIONAL_GE:
2010 case BRW_CONDITIONAL_G:
2011 switch (inst->src[1].type) {
2012 case BRW_REGISTER_TYPE_F:
2013 if (inst->src[1].imm.f <= 0.0f) {
2014 inst->opcode = BRW_OPCODE_MOV;
2015 inst->src[1] = reg_undef;
2016 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2017 progress = true;
2018 }
2019 break;
2020 default:
2021 break;
2022 }
2023 default:
2024 break;
2025 }
2026 }
2027 break;
2028 default:
2029 break;
2030 }
2031 }
2032
2033 return progress;
2034 }
2035
2036 /**
2037 * Removes any instructions writing a VGRF where that VGRF is not used by any
2038 * later instruction.
2039 */
2040 bool
2041 fs_visitor::dead_code_eliminate()
2042 {
2043 bool progress = false;
2044 int pc = 0;
2045
2046 calculate_live_intervals();
2047
2048 foreach_list_safe(node, &this->instructions) {
2049 fs_inst *inst = (fs_inst *)node;
2050
2051 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2052 bool dead = true;
2053
2054 for (int i = 0; i < inst->regs_written; i++) {
2055 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2056 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2057 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2058 dead = false;
2059 break;
2060 }
2061 }
2062
2063 if (dead) {
2064 /* Don't dead code eliminate instructions that write to the
2065 * accumulator as a side-effect. Instead just set the destination
2066 * to the null register to free it.
2067 */
2068 switch (inst->opcode) {
2069 case BRW_OPCODE_ADDC:
2070 case BRW_OPCODE_SUBB:
2071 case BRW_OPCODE_MACH:
2072 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2073 break;
2074 default:
2075 inst->remove();
2076 progress = true;
2077 break;
2078 }
2079 }
2080 }
2081
2082 pc++;
2083 }
2084
2085 if (progress)
2086 invalidate_live_intervals();
2087
2088 return progress;
2089 }
2090
2091 struct dead_code_hash_key
2092 {
2093 int vgrf;
2094 int reg_offset;
2095 };
2096
2097 static bool
2098 dead_code_hash_compare(const void *a, const void *b)
2099 {
2100 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2101 }
2102
2103 static void
2104 clear_dead_code_hash(struct hash_table *ht)
2105 {
2106 struct hash_entry *entry;
2107
2108 hash_table_foreach(ht, entry) {
2109 _mesa_hash_table_remove(ht, entry);
2110 }
2111 }
2112
2113 static void
2114 insert_dead_code_hash(struct hash_table *ht,
2115 int vgrf, int reg_offset, fs_inst *inst)
2116 {
2117 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2118 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2119
2120 key->vgrf = vgrf;
2121 key->reg_offset = reg_offset;
2122
2123 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2124 }
2125
2126 static struct hash_entry *
2127 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2128 {
2129 struct dead_code_hash_key key;
2130
2131 key.vgrf = vgrf;
2132 key.reg_offset = reg_offset;
2133
2134 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2135 }
2136
2137 static void
2138 remove_dead_code_hash(struct hash_table *ht,
2139 int vgrf, int reg_offset)
2140 {
2141 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2142 if (!entry)
2143 return;
2144
2145 _mesa_hash_table_remove(ht, entry);
2146 }
2147
2148 /**
2149 * Walks basic blocks, removing any regs that are written but not read before
2150 * being redefined.
2151 *
2152 * The dead_code_eliminate() function implements a global dead code
2153 * elimination, but it only handles removing the last write to a register
2154 * if it's never read. This one can handle intermediate writes, but only
2155 * within a basic block.
2156 */
2157 bool
2158 fs_visitor::dead_code_eliminate_local()
2159 {
2160 struct hash_table *ht;
2161 bool progress = false;
2162
2163 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2164
2165 foreach_list_safe(node, &this->instructions) {
2166 fs_inst *inst = (fs_inst *)node;
2167
2168 /* At a basic block boundary, empty the HT since we don't track
2169 * dataflow across blocks here.
2170 */
2171 if (inst->is_control_flow()) {
2172 clear_dead_code_hash(ht);
2173 continue;
2174 }
2175
2176 /* Clear the HT of any instructions that got read. */
2177 for (int i = 0; i < 3; i++) {
2178 fs_reg src = inst->src[i];
2179 if (src.file != GRF)
2180 continue;
2181
2182 int read = 1;
2183 if (inst->is_send_from_grf())
2184 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2185
2186 for (int reg_offset = src.reg_offset;
2187 reg_offset < src.reg_offset + read;
2188 reg_offset++) {
2189 remove_dead_code_hash(ht, src.reg, reg_offset);
2190 }
2191 }
2192
2193 /* Add any update of a GRF to the HT, removing a previous write if it
2194 * wasn't read.
2195 */
2196 if (inst->dst.file == GRF) {
2197 if (inst->regs_written > 1) {
2198 /* We don't know how to trim channels from an instruction's
2199 * writes, so we can't incrementally remove unread channels from
2200 * it. Just remove whatever it overwrites from the table.
2201 */
2202 for (int i = 0; i < inst->regs_written; i++) {
2203 remove_dead_code_hash(ht,
2204 inst->dst.reg,
2205 inst->dst.reg_offset + i);
2206 }
2207 } else {
2208 struct hash_entry *entry =
2209 get_dead_code_hash_entry(ht, inst->dst.reg,
2210 inst->dst.reg_offset);
2211
2212 if (entry) {
2213 if (inst->is_partial_write()) {
2214 /* For a partial write, we can't remove any previous dead code
2215 * candidate, since we're just modifying its result.
2216 */
2217 } else {
2218 /* We're completely updating a channel, and there was a
2219 * previous write to the channel that wasn't read. Kill it!
2220 */
2221 fs_inst *inst = (fs_inst *)entry->data;
2222 inst->remove();
2223 progress = true;
2224 }
2225
2226 _mesa_hash_table_remove(ht, entry);
2227 }
2228
2229 if (!inst->has_side_effects())
2230 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2231 inst);
2232 }
2233 }
2234 }
2235
2236 _mesa_hash_table_destroy(ht, NULL);
2237
2238 if (progress)
2239 invalidate_live_intervals();
2240
2241 return progress;
2242 }
2243
2244 /**
2245 * Implements register coalescing: Checks if the two registers involved in a
2246 * raw move don't interfere, in which case they can both be stored in the same
2247 * place and the MOV removed.
2248 */
2249 bool
2250 fs_visitor::register_coalesce()
2251 {
2252 bool progress = false;
2253
2254 calculate_live_intervals();
2255
2256 foreach_list_safe(node, &this->instructions) {
2257 fs_inst *inst = (fs_inst *)node;
2258
2259 if (inst->opcode != BRW_OPCODE_MOV ||
2260 inst->is_partial_write() ||
2261 inst->saturate ||
2262 inst->src[0].file != GRF ||
2263 inst->src[0].negate ||
2264 inst->src[0].abs ||
2265 inst->src[0].smear != -1 ||
2266 inst->dst.file != GRF ||
2267 inst->dst.type != inst->src[0].type ||
2268 virtual_grf_sizes[inst->src[0].reg] != 1) {
2269 continue;
2270 }
2271
2272 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2273 int var_to = live_intervals->var_from_reg(&inst->dst);
2274
2275 if (live_intervals->vars_interfere(var_from, var_to) &&
2276 !inst->dst.equals(inst->src[0]))
2277 continue;
2278
2279 int reg_from = inst->src[0].reg;
2280 assert(inst->src[0].reg_offset == 0);
2281 int reg_to = inst->dst.reg;
2282 int reg_to_offset = inst->dst.reg_offset;
2283
2284 foreach_list(node, &this->instructions) {
2285 fs_inst *scan_inst = (fs_inst *)node;
2286
2287 if (scan_inst->dst.file == GRF &&
2288 scan_inst->dst.reg == reg_from) {
2289 scan_inst->dst.reg = reg_to;
2290 scan_inst->dst.reg_offset = reg_to_offset;
2291 }
2292 for (int i = 0; i < 3; i++) {
2293 if (scan_inst->src[i].file == GRF &&
2294 scan_inst->src[i].reg == reg_from) {
2295 scan_inst->src[i].reg = reg_to;
2296 scan_inst->src[i].reg_offset = reg_to_offset;
2297 }
2298 }
2299 }
2300
2301 inst->remove();
2302 progress = true;
2303 continue;
2304 }
2305
2306 if (progress)
2307 invalidate_live_intervals();
2308
2309 return progress;
2310 }
2311
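/**
 * Looks for MOVs from a GRF to an MRF and tries to rewrite the instruction
 * that produced the GRF value to write directly into the MRF, removing the
 * intervening MOV.
 */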
2312 bool
2313 fs_visitor::compute_to_mrf()
2314 {
2315 bool progress = false;
2316 int next_ip = 0;
2317
2318 calculate_live_intervals();
2319
2320 foreach_list_safe(node, &this->instructions) {
2321 fs_inst *inst = (fs_inst *)node;
2322
2323 int ip = next_ip;
2324 next_ip++;
2325
2326 if (inst->opcode != BRW_OPCODE_MOV ||
2327 inst->is_partial_write() ||
2328 inst->dst.file != MRF || inst->src[0].file != GRF ||
2329 inst->dst.type != inst->src[0].type ||
2330 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2331 continue;
2332
2333 /* Work out which hardware MRF registers are written by this
2334 * instruction.
2335 */
2336 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2337 int mrf_high;
2338 if (inst->dst.reg & BRW_MRF_COMPR4) {
2339 mrf_high = mrf_low + 4;
2340 } else if (dispatch_width == 16 &&
2341 (!inst->force_uncompressed && !inst->force_sechalf)) {
2342 mrf_high = mrf_low + 1;
2343 } else {
2344 mrf_high = mrf_low;
2345 }
2346
2347 /* Can't compute-to-MRF this GRF if someone else was going to
2348 * read it later.
2349 */
2350 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2351 continue;
2352
2353 /* Found a move of a GRF to an MRF. Let's see if we can
2354 * rewrite the instruction that produced this GRF to write into the MRF.
2355 */
2356 fs_inst *scan_inst;
2357 for (scan_inst = (fs_inst *)inst->prev;
2358 scan_inst->prev != NULL;
2359 scan_inst = (fs_inst *)scan_inst->prev) {
2360 if (scan_inst->dst.file == GRF &&
2361 scan_inst->dst.reg == inst->src[0].reg) {
2362 /* Found the last instruction to write the register we want to
2363 * turn into a compute-to-MRF.
2364 */
2365
2366 /* If this one instruction didn't populate all the
2367 * channels, bail. We might be able to rewrite everything
2368 * that writes that reg, but it would require smarter
2369 * tracking to delay the rewriting until complete success.
2370 */
2371 if (scan_inst->is_partial_write())
2372 break;
2373
2374 /* Instructions writing more than one register would require us to
2375 * coalesce out more than one MOV at a time.
2376 */
2377 if (scan_inst->regs_written > 1)
2378 break;
2379
2380 /* SEND instructions can't have MRF as a destination. */
2381 if (scan_inst->mlen)
2382 break;
2383
2384 if (brw->gen == 6) {
2385 /* gen6 math instructions must have the destination be
2386 * GRF, so no compute-to-MRF for them.
2387 */
2388 if (scan_inst->is_math()) {
2389 break;
2390 }
2391 }
2392
2393 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2394 /* Found the creator of our MRF's source value. */
2395 scan_inst->dst.file = MRF;
2396 scan_inst->dst.reg = inst->dst.reg;
2397 scan_inst->saturate |= inst->saturate;
2398 inst->remove();
2399 progress = true;
2400 }
2401 break;
2402 }
2403
2404 /* We don't handle control flow here. Most computation of
2405 * values that end up in MRFs happens shortly before the MRF
2406 * write anyway.
2407 */
2408 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2409 break;
2410
2411 /* You can't read from an MRF, so if someone else reads our
2412 * MRF's source GRF that we wanted to rewrite, that stops us.
2413 */
2414 bool interfered = false;
2415 for (int i = 0; i < 3; i++) {
2416 if (scan_inst->src[i].file == GRF &&
2417 scan_inst->src[i].reg == inst->src[0].reg &&
2418 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2419 interfered = true;
2420 }
2421 }
2422 if (interfered)
2423 break;
2424
2425 if (scan_inst->dst.file == MRF) {
2426 /* If somebody else writes our MRF here, we can't
2427 * compute-to-MRF before that.
2428 */
2429 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2430 int scan_mrf_high;
2431
2432 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2433 scan_mrf_high = scan_mrf_low + 4;
2434 } else if (dispatch_width == 16 &&
2435 (!scan_inst->force_uncompressed &&
2436 !scan_inst->force_sechalf)) {
2437 scan_mrf_high = scan_mrf_low + 1;
2438 } else {
2439 scan_mrf_high = scan_mrf_low;
2440 }
2441
2442 if (mrf_low == scan_mrf_low ||
2443 mrf_low == scan_mrf_high ||
2444 mrf_high == scan_mrf_low ||
2445 mrf_high == scan_mrf_high) {
2446 break;
2447 }
2448 }
2449
2450 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2451 /* Found a SEND instruction, which means that there are
2452 * live values in MRFs from base_mrf to base_mrf +
2453 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2454 * above it.
2455 */
2456 if (mrf_low >= scan_inst->base_mrf &&
2457 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2458 break;
2459 }
2460 if (mrf_high >= scan_inst->base_mrf &&
2461 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2462 break;
2463 }
2464 }
2465 }
2466 }
2467
2468 if (progress)
2469 invalidate_live_intervals();
2470
2471 return progress;
2472 }
2473
2474 /**
2475 * Walks through basic blocks, looking for repeated MRF writes and
2476 * removing the later ones.
2477 */
2478 bool
2479 fs_visitor::remove_duplicate_mrf_writes()
2480 {
2481 fs_inst *last_mrf_move[16];
2482 bool progress = false;
2483
2484 /* We'd need to update the MRF tracking to handle compressed instructions, so skip this pass in 16-wide dispatch. */
2485 if (dispatch_width == 16)
2486 return false;
2487
2488 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2489
2490 foreach_list_safe(node, &this->instructions) {
2491 fs_inst *inst = (fs_inst *)node;
2492
2493 if (inst->is_control_flow()) {
2494 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2495 }
2496
2497 if (inst->opcode == BRW_OPCODE_MOV &&
2498 inst->dst.file == MRF) {
2499 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2500 if (prev_inst && inst->equals(prev_inst)) {
2501 inst->remove();
2502 progress = true;
2503 continue;
2504 }
2505 }
2506
2507 /* Clear out the last-write records for MRFs that were overwritten. */
2508 if (inst->dst.file == MRF) {
2509 last_mrf_move[inst->dst.reg] = NULL;
2510 }
2511
2512 if (inst->mlen > 0 && inst->base_mrf != -1) {
2513 /* Found a SEND instruction, which will include two or fewer
2514 * implied MRF writes. We could do better here.
2515 */
2516 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2517 last_mrf_move[inst->base_mrf + i] = NULL;
2518 }
2519 }
2520
2521 /* Clear out any MRF move records whose sources got overwritten. */
2522 if (inst->dst.file == GRF) {
2523 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2524 if (last_mrf_move[i] &&
2525 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2526 last_mrf_move[i] = NULL;
2527 }
2528 }
2529 }
2530
2531 if (inst->opcode == BRW_OPCODE_MOV &&
2532 inst->dst.file == MRF &&
2533 inst->src[0].file == GRF &&
2534 !inst->is_partial_write()) {
2535 last_mrf_move[inst->dst.reg] = inst;
2536 }
2537 }
2538
2539 if (progress)
2540 invalidate_live_intervals();
2541
2542 return progress;
2543 }
2544
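/**
 * Clears the dependency flag for any register in the tracked range that this
 * instruction reads, since the read satisfies the outstanding dependency.
 */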
2545 static void
2546 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2547 int first_grf, int grf_len)
2548 {
2549 bool inst_16wide = (dispatch_width > 8 &&
2550 !inst->force_uncompressed &&
2551 !inst->force_sechalf);
2552
2553 /* Clear the flag for registers that actually got read (as expected). */
2554 for (int i = 0; i < 3; i++) {
2555 int grf;
2556 if (inst->src[i].file == GRF) {
2557 grf = inst->src[i].reg;
2558 } else if (inst->src[i].file == HW_REG &&
2559 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2560 grf = inst->src[i].fixed_hw_reg.nr;
2561 } else {
2562 continue;
2563 }
2564
2565 if (grf >= first_grf &&
2566 grf < first_grf + grf_len) {
2567 deps[grf - first_grf] = false;
2568 if (inst_16wide)
2569 deps[grf - first_grf + 1] = false;
2570 }
2571 }
2572 }
2573
2574 /**
2575 * Implements this workaround for the original 965:
2576 *
2577 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2578 * check for post destination dependencies on this instruction, software
2579 * must ensure that there is no destination hazard for the case of ‘write
2580 * followed by a posted write’ shown in the following example.
2581 *
2582 * 1. mov r3 0
2583 * 2. send r3.xy <rest of send instruction>
2584 * 3. mov r2 r3
2585 *
2586 * Due to no post-destination dependency check on the ‘send’, the above
2587 * code sequence could have two instructions (1 and 2) in flight at the
2588 * same time that both consider ‘r3’ as the target of their final writes.
2589 */
2590 void
2591 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2592 {
2593 int reg_size = dispatch_width / 8;
2594 int write_len = inst->regs_written * reg_size;
2595 int first_write_grf = inst->dst.reg;
2596 bool needs_dep[BRW_MAX_MRF];
2597 assert(write_len < (int)sizeof(needs_dep) - 1);
2598
2599 memset(needs_dep, false, sizeof(needs_dep));
2600 memset(needs_dep, true, write_len);
2601
2602 clear_deps_for_inst_src(inst, dispatch_width,
2603 needs_dep, first_write_grf, write_len);
2604
2605 /* Walk backwards looking for writes to registers we're writing which
2606 * aren't read since being written. If we hit the start of the program,
2607 * we assume that there are no outstanding dependencies on entry to the
2608 * program.
2609 */
2610 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2611 scan_inst != NULL;
2612 scan_inst = (fs_inst *)scan_inst->prev) {
2613
2614 /* If we hit control flow, assume that there *are* outstanding
2615 * dependencies, and force their cleanup before our instruction.
2616 */
2617 if (scan_inst->is_control_flow()) {
2618 for (int i = 0; i < write_len; i++) {
2619 if (needs_dep[i]) {
2620 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2621 }
2622 }
2623 return;
2624 }
2625
2626 bool scan_inst_16wide = (dispatch_width > 8 &&
2627 !scan_inst->force_uncompressed &&
2628 !scan_inst->force_sechalf);
2629
2630 /* We insert our reads as late as possible on the assumption that any
2631 * instruction but a MOV that might have left us an outstanding
2632 * dependency has more latency than a MOV.
2633 */
2634 if (scan_inst->dst.file == GRF) {
2635 for (int i = 0; i < scan_inst->regs_written; i++) {
2636 int reg = scan_inst->dst.reg + i * reg_size;
2637
2638 if (reg >= first_write_grf &&
2639 reg < first_write_grf + write_len &&
2640 needs_dep[reg - first_write_grf]) {
2641 inst->insert_before(DEP_RESOLVE_MOV(reg));
2642 needs_dep[reg - first_write_grf] = false;
2643 if (scan_inst_16wide)
2644 needs_dep[reg - first_write_grf + 1] = false;
2645 }
2646 }
2647 }
2648
2649 /* Clear the flag for registers that actually got read (as expected). */
2650 clear_deps_for_inst_src(scan_inst, dispatch_width,
2651 needs_dep, first_write_grf, write_len);
2652
2653 /* Continue the loop only if we haven't resolved all the dependencies */
2654 int i;
2655 for (i = 0; i < write_len; i++) {
2656 if (needs_dep[i])
2657 break;
2658 }
2659 if (i == write_len)
2660 return;
2661 }
2662 }
2663
2664 /**
2665 * Implements this workaround for the original 965:
2666 *
2667 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2668 * used as a destination register until after it has been sourced by an
2669 * instruction with a different destination register.
2670 */
2671 void
2672 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2673 {
2674 int write_len = inst->regs_written * dispatch_width / 8;
2675 int first_write_grf = inst->dst.reg;
2676 bool needs_dep[BRW_MAX_MRF];
2677 assert(write_len < (int)sizeof(needs_dep) - 1);
2678
2679 memset(needs_dep, false, sizeof(needs_dep));
2680 memset(needs_dep, true, write_len);
2681 /* Walk forwards looking for writes to registers we're writing which aren't
2682 * read before being written.
2683 */
2684 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2685 !scan_inst->is_tail_sentinel();
2686 scan_inst = (fs_inst *)scan_inst->next) {
2687 /* If we hit control flow, force resolve all remaining dependencies. */
2688 if (scan_inst->is_control_flow()) {
2689 for (int i = 0; i < write_len; i++) {
2690 if (needs_dep[i])
2691 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2692 }
2693 return;
2694 }
2695
2696 /* Clear the flag for registers that actually got read (as expected). */
2697 clear_deps_for_inst_src(scan_inst, dispatch_width,
2698 needs_dep, first_write_grf, write_len);
2699
2700 /* We insert our reads as late as possible since they're reading the
2701 * result of a SEND, which has massive latency.
2702 */
2703 if (scan_inst->dst.file == GRF &&
2704 scan_inst->dst.reg >= first_write_grf &&
2705 scan_inst->dst.reg < first_write_grf + write_len &&
2706 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2707 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2708 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2709 }
2710
2711 /* Continue the loop only if we haven't resolved all the dependencies */
2712 int i;
2713 for (i = 0; i < write_len; i++) {
2714 if (needs_dep[i])
2715 break;
2716 }
2717 if (i == write_len)
2718 return;
2719 }
2720
2721 /* If we hit the end of the program, resolve all remaining dependencies out
2722 * of paranoia.
2723 */
2724 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2725 assert(last_inst->eot);
2726 for (int i = 0; i < write_len; i++) {
2727 if (needs_dep[i])
2728 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2729 }
2730 }
2731
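/**
 * Applies the pre- and post-SEND dependency workarounds to every SEND
 * instruction that writes a GRF destination.
 */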
2732 void
2733 fs_visitor::insert_gen4_send_dependency_workarounds()
2734 {
2735 if (brw->gen != 4 || brw->is_g4x)
2736 return;
2737
2738 /* Note that we're done with register allocation, so GRF fs_regs always
2739 * have a .reg_offset of 0.
2740 */
2741
2742 foreach_list_safe(node, &this->instructions) {
2743 fs_inst *inst = (fs_inst *)node;
2744
2745 if (inst->mlen != 0 && inst->dst.file == GRF) {
2746 insert_gen4_pre_send_dependency_workarounds(inst);
2747 insert_gen4_post_send_dependency_workarounds(inst);
2748 }
2749 }
2750 }
2751
2752 /**
2753 * Turns the generic expression-style uniform pull constant load instruction
2754 * into a hardware-specific series of instructions for loading a pull
2755 * constant.
2756 *
2757 * The expression style allows the CSE pass before this to optimize out
2758 * repeated loads from the same offset, and gives the pre-register-allocation
2759 * scheduling full flexibility, while the conversion to native instructions
2760 * allows the post-register-allocation scheduler the best information
2761 * possible.
2762 *
2763 * Note that execution masking for setting up pull constant loads is special:
2764 * the channels that need to be written are unrelated to the current execution
2765 * mask, since a later instruction will use one of the result channels as a
2766 * source operand for all 8 or 16 of its channels.
2767 */
2768 void
2769 fs_visitor::lower_uniform_pull_constant_loads()
2770 {
2771 foreach_list(node, &this->instructions) {
2772 fs_inst *inst = (fs_inst *)node;
2773
2774 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2775 continue;
2776
2777 if (brw->gen >= 7) {
2778 /* The offset arg before was a vec4-aligned byte offset. We need to
2779 * turn it into a dword offset.
2780 */
2781 fs_reg const_offset_reg = inst->src[1];
2782 assert(const_offset_reg.file == IMM &&
2783 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2784 const_offset_reg.imm.u /= 4;
2785 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2786
2787 /* This is actually going to be a MOV, but since only the first dword
2788 * is accessed, we have a special opcode to do just that one. Note
2789 * that this needs to be an operation that will be considered a def
2790 * by live variable analysis, or register allocation will explode.
2791 */
2792 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2793 payload, const_offset_reg);
2794 setup->force_writemask_all = true;
2795
2796 setup->ir = inst->ir;
2797 setup->annotation = inst->annotation;
2798 inst->insert_before(setup);
2799
2800 /* Similarly, this will only populate the first 4 channels of the
2801 * result register (since we only use smear values from 0-3), but we
2802 * don't tell the optimizer.
2803 */
2804 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2805 inst->src[1] = payload;
2806
2807 invalidate_live_intervals();
2808 } else {
2809 /* Before register allocation, we didn't tell the scheduler about the
2810 * MRF we use. We know it's safe to use this MRF because nothing
2811 * else does except for register spill/unspill, which generates and
2812 * uses its MRF within a single IR instruction.
2813 */
2814 inst->base_mrf = 14;
2815 inst->mlen = 1;
2816 }
2817 }
2818 }
2819
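/**
 * Prints a human-readable form of a single instruction: predication, opcode,
 * conditional modifier, destination, and sources.
 */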
2820 void
2821 fs_visitor::dump_instruction(backend_instruction *be_inst)
2822 {
2823 fs_inst *inst = (fs_inst *)be_inst;
2824
2825 if (inst->predicate) {
2826 printf("(%cf0.%d) ",
2827 inst->predicate_inverse ? '-' : '+',
2828 inst->flag_subreg);
2829 }
2830
2831 printf("%s", brw_instruction_name(inst->opcode));
2832 if (inst->saturate)
2833 printf(".sat");
2834 if (inst->conditional_mod) {
2835 printf("%s", conditional_modifier[inst->conditional_mod]);
2836 if (!inst->predicate &&
2837 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2838 inst->opcode != BRW_OPCODE_IF &&
2839 inst->opcode != BRW_OPCODE_WHILE))) {
2840 printf(".f0.%d", inst->flag_subreg);
2841 }
2842 }
2843 printf(" ");
2844
2845
2846 switch (inst->dst.file) {
2847 case GRF:
2848 printf("vgrf%d", inst->dst.reg);
2849 if (inst->dst.reg_offset)
2850 printf("+%d", inst->dst.reg_offset);
2851 break;
2852 case MRF:
2853 printf("m%d", inst->dst.reg);
2854 break;
2855 case BAD_FILE:
2856 printf("(null)");
2857 break;
2858 case UNIFORM:
2859 printf("***u%d***", inst->dst.reg);
2860 break;
2861 case HW_REG:
2862 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2863 switch (inst->dst.fixed_hw_reg.nr) {
2864 case BRW_ARF_NULL:
2865 printf("null");
2866 break;
2867 case BRW_ARF_ADDRESS:
2868 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
2869 break;
2870 case BRW_ARF_ACCUMULATOR:
2871 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
2872 break;
2873 case BRW_ARF_FLAG:
2874 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2875 inst->dst.fixed_hw_reg.subnr);
2876 break;
2877 default:
2878 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2879 inst->dst.fixed_hw_reg.subnr);
2880 break;
2881 }
2882 } else {
2883 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2884 }
2885 if (inst->dst.fixed_hw_reg.subnr)
2886 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2887 break;
2888 default:
2889 printf("???");
2890 break;
2891 }
2892 printf(":%s, ", reg_encoding[inst->dst.type]);
2893
2894 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
2895 if (inst->src[i].negate)
2896 printf("-");
2897 if (inst->src[i].abs)
2898 printf("|");
2899 switch (inst->src[i].file) {
2900 case GRF:
2901 printf("vgrf%d", inst->src[i].reg);
2902 if (inst->src[i].reg_offset)
2903 printf("+%d", inst->src[i].reg_offset);
2904 break;
2905 case MRF:
2906 printf("***m%d***", inst->src[i].reg);
2907 break;
2908 case UNIFORM:
2909 printf("u%d", inst->src[i].reg);
2910 if (inst->src[i].reg_offset)
2911 printf(".%d", inst->src[i].reg_offset);
2912 break;
2913 case BAD_FILE:
2914 printf("(null)");
2915 break;
2916 case IMM:
2917 switch (inst->src[i].type) {
2918 case BRW_REGISTER_TYPE_F:
2919 printf("%ff", inst->src[i].imm.f);
2920 break;
2921 case BRW_REGISTER_TYPE_D:
2922 printf("%dd", inst->src[i].imm.i);
2923 break;
2924 case BRW_REGISTER_TYPE_UD:
2925 printf("%uu", inst->src[i].imm.u);
2926 break;
2927 default:
2928 printf("???");
2929 break;
2930 }
2931 break;
2932 case HW_REG:
2933 if (inst->src[i].fixed_hw_reg.negate)
2934 printf("-");
2935 if (inst->src[i].fixed_hw_reg.abs)
2936 printf("|");
2937 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2938 switch (inst->src[i].fixed_hw_reg.nr) {
2939 case BRW_ARF_NULL:
2940 printf("null");
2941 break;
2942 case BRW_ARF_ADDRESS:
2943 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
2944 break;
2945 case BRW_ARF_ACCUMULATOR:
2946 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
2947 break;
2948 case BRW_ARF_FLAG:
2949 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2950 inst->src[i].fixed_hw_reg.subnr);
2951 break;
2952 default:
2953 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2954 inst->src[i].fixed_hw_reg.subnr);
2955 break;
2956 }
2957 } else {
2958 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2959 }
2960 if (inst->src[i].fixed_hw_reg.subnr)
2961 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2962 if (inst->src[i].fixed_hw_reg.abs)
2963 printf("|");
2964 break;
2965 default:
2966 printf("???");
2967 break;
2968 }
2969 if (inst->src[i].abs)
2970 printf("|");
2971
2972 if (inst->src[i].file != IMM) {
2973 printf(":%s", reg_encoding[inst->src[i].type]);
2974 }
2975
2976 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
2977 printf(", ");
2978 }
2979
2980 printf(" ");
2981
2982 if (inst->force_uncompressed)
2983 printf("1sthalf ");
2984
2985 if (inst->force_sechalf)
2986 printf("2ndhalf ");
2987
2988 printf("\n");
2989 }
2990
2991 /**
2992 * Possibly returns an instruction that set up @param reg.
2993 *
2994 * Sometimes we want to take the result of some expression/variable
2995 * dereference tree and rewrite the instruction generating the result
2996 * of the tree. When processing the tree, we know that the
2997 * instructions generated are all writing temporaries that are dead
2998 * outside of this tree. So, if we have some instructions that write
2999 * a temporary, we're free to point that temp write somewhere else.
3000 *
3001 * Note that this doesn't guarantee that the instruction only generated
3002 * reg -- it might be the size=4 destination of a texture instruction.
3003 */
3004 fs_inst *
3005 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3006 fs_inst *end,
3007 fs_reg reg)
3008 {
3009 if (end == start ||
3010 end->is_partial_write() ||
3011 reg.reladdr ||
3012 !reg.equals(end->dst)) {
3013 return NULL;
3014 } else {
3015 return end;
3016 }
3017 }
3018
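/**
 * Lays out the gen6+ thread payload: the fixed header registers, the enabled
 * barycentric coordinate sets, interpolated depth and W, and the MSAA
 * position offsets, recording the register location of each.
 */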
3019 void
3020 fs_visitor::setup_payload_gen6()
3021 {
3022 bool uses_depth =
3023 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3024 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3025
3026 assert(brw->gen >= 6);
3027
3028 /* R0-1: masks, pixel X/Y coordinates. */
3029 c->nr_payload_regs = 2;
3030 /* R2: only for 32-pixel dispatch. */
3031
3032 /* R3-26: barycentric interpolation coordinates. These appear in the
3033 * same order that they appear in the brw_wm_barycentric_interp_mode
3034 * enum. Each set of coordinates occupies 2 registers if dispatch width
3035 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3036 * appear if they were enabled using the "Barycentric Interpolation
3037 * Mode" bits in WM_STATE.
3038 */
3039 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3040 if (barycentric_interp_modes & (1 << i)) {
3041 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3042 c->nr_payload_regs += 2;
3043 if (dispatch_width == 16) {
3044 c->nr_payload_regs += 2;
3045 }
3046 }
3047 }
3048
3049 /* R27: interpolated depth if uses source depth */
3050 if (uses_depth) {
3051 c->source_depth_reg = c->nr_payload_regs;
3052 c->nr_payload_regs++;
3053 if (dispatch_width == 16) {
3054 /* R28: interpolated depth if not 8-wide. */
3055 c->nr_payload_regs++;
3056 }
3057 }
3058 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3059 if (uses_depth) {
3060 c->source_w_reg = c->nr_payload_regs;
3061 c->nr_payload_regs++;
3062 if (dispatch_width == 16) {
3063 /* R30: interpolated W if not 8-wide. */
3064 c->nr_payload_regs++;
3065 }
3066 }
3067
3068 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3069 /* R31: MSAA position offsets. */
3070 if (c->prog_data.uses_pos_offset) {
3071 c->sample_pos_reg = c->nr_payload_regs;
3072 c->nr_payload_regs++;
3073 }
3074
3075 /* R32-: bary for 32-pixel. */
3076 /* R58-59: interp W for 32-pixel. */
3077
3078 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3079 c->source_depth_to_render_target = true;
3080 }
3081 }
3082
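/**
 * Assigns binding table offsets, placing the render target surfaces first
 * and then the surfaces common to all stages.
 */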
3083 void
3084 fs_visitor::assign_binding_table_offsets()
3085 {
3086 uint32_t next_binding_table_offset = 0;
3087
3088 /* If there are no color regions, we still perform an FB write to a null
3089 * renderbuffer, which we place at surface index 0.
3090 */
3091 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3092 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3093
3094 assign_common_binding_table_offsets(next_binding_table_offset);
3095 }
3096
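/**
 * Runs the compile at this dispatch width: sets up the payload, emits the
 * FS IR, runs the optimization loop to a fixed point, and then schedules and
 * register allocates.
 */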
3097 bool
3098 fs_visitor::run()
3099 {
3100 sanity_param_count = fp->Base.Parameters->NumParameters;
3101 uint32_t orig_nr_params = c->prog_data.nr_params;
3102 bool allocated_without_spills;
3103
3104 assign_binding_table_offsets();
3105
3106 if (brw->gen >= 6)
3107 setup_payload_gen6();
3108 else
3109 setup_payload_gen4();
3110
3111 if (0) {
3112 emit_dummy_fs();
3113 } else {
3114 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3115 emit_shader_time_begin();
3116
3117 calculate_urb_setup();
3118 if (fp->Base.InputsRead > 0) {
3119 if (brw->gen < 6)
3120 emit_interpolation_setup_gen4();
3121 else
3122 emit_interpolation_setup_gen6();
3123 }
3124
3125 /* We handle discards by keeping track of the still-live pixels in f0.1.
3126 * Initialize it with the dispatched pixels.
3127 */
3128 if (fp->UsesKill || c->key.alpha_test_func) {
3129 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3130 discard_init->flag_subreg = 1;
3131 }
3132
3133 /* Generate FS IR for main(). (The visitor only descends into
3134 * functions called "main".)
3135 */
3136 if (shader) {
3137 foreach_list(node, &*shader->ir) {
3138 ir_instruction *ir = (ir_instruction *)node;
3139 base_ir = ir;
3140 this->result = reg_undef;
3141 ir->accept(this);
3142 }
3143 } else {
3144 emit_fragment_program_code();
3145 }
3146 base_ir = NULL;
3147 if (failed)
3148 return false;
3149
3150 emit(FS_OPCODE_PLACEHOLDER_HALT);
3151
3152 if (c->key.alpha_test_func)
3153 emit_alpha_test();
3154
3155 emit_fb_writes();
3156
3157 split_virtual_grfs();
3158
3159 move_uniform_array_access_to_pull_constants();
3160 remove_dead_constants();
3161 setup_pull_constants();
3162
3163 bool progress;
3164 do {
3165 progress = false;
3166
3167 compact_virtual_grfs();
3168
3169 progress = remove_duplicate_mrf_writes() || progress;
3170
3171 progress = opt_algebraic() || progress;
3172 progress = opt_cse() || progress;
3173 progress = opt_copy_propagate() || progress;
3174 progress = opt_peephole_sel() || progress;
3175 progress = opt_peephole_predicated_break() || progress;
3176 progress = dead_code_eliminate() || progress;
3177 progress = dead_code_eliminate_local() || progress;
3178 progress = dead_control_flow_eliminate(this) || progress;
3179 progress = register_coalesce() || progress;
3180 progress = compute_to_mrf() || progress;
3181 } while (progress);
3182
3183 lower_uniform_pull_constant_loads();
3184
3185 assign_curb_setup();
3186 assign_urb_setup();
3187
3188 static enum instruction_scheduler_mode pre_modes[] = {
3189 SCHEDULE_PRE,
3190 SCHEDULE_PRE_NON_LIFO,
3191 SCHEDULE_PRE_LIFO,
3192 };
3193
3194 /* Try each scheduling heuristic to see if it can successfully register
3195 * allocate without spilling. They should be ordered by decreasing
3196 * performance but increasing likelihood of allocating.
3197 */
3198 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3199 schedule_instructions(pre_modes[i]);
3200
3201 if (0) {
3202 assign_regs_trivial();
3203 allocated_without_spills = true;
3204 } else {
3205 allocated_without_spills = assign_regs(false);
3206 }
3207 if (allocated_without_spills)
3208 break;
3209 }
3210
3211 if (!allocated_without_spills) {
3212 /* We assume that any spilling is worse than just dropping back to
3213 * SIMD8. There's probably actually some intermediate point where
3214 * SIMD16 with a couple of spills is still better.
3215 */
3216 if (dispatch_width == 16) {
3217 fail("Failure to register allocate. Reduce number of "
3218 "live scalar values to avoid this.");
3219 }
3220
3221 /* Since we're out of heuristics, just go spill registers until we
3222 * get an allocation.
3223 */
3224 while (!assign_regs(true)) {
3225 if (failed)
3226 break;
3227 }
3228 }
3229 }
3230 assert(force_uncompressed_stack == 0);
3231
3232 /* This must come after all optimization and register allocation, since
3233 * it inserts dead code that happens to have side effects, and it does
3234 * so based on the actual physical registers in use.
3235 */
3236 insert_gen4_send_dependency_workarounds();
3237
3238 if (failed)
3239 return false;
3240
3241 if (!allocated_without_spills)
3242 schedule_instructions(SCHEDULE_POST);
3243
3244 if (dispatch_width == 8) {
3245 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3246 } else {
3247 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3248
3249 /* Make sure we didn't try to sneak in an extra uniform */
3250 assert(orig_nr_params == c->prog_data.nr_params);
3251 (void) orig_nr_params;
3252 }
3253
3254 /* If any state parameters were appended, then ParameterValues could have
3255 * been realloced, in which case the driver uniform storage set up by
3256 * _mesa_associate_uniform_storage() would point to freed memory. Make
3257 * sure that didn't happen.
3258 */
3259 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3260
3261 return !failed;
3262 }
3263
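/**
 * Compiles the fragment shader to native code, generating an 8-wide program
 * and, when possible, a 16-wide program as well.
 */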
3264 const unsigned *
3265 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3266 struct gl_fragment_program *fp,
3267 struct gl_shader_program *prog,
3268 unsigned *final_assembly_size)
3269 {
3270 bool start_busy = false;
3271 float start_time = 0;
3272
3273 if (unlikely(brw->perf_debug)) {
3274 start_busy = (brw->batch.last_bo &&
3275 drm_intel_bo_busy(brw->batch.last_bo));
3276 start_time = get_time();
3277 }
3278
3279 struct brw_shader *shader = NULL;
3280 if (prog)
3281 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3282
3283 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3284 if (prog) {
3285 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3286 _mesa_print_ir(shader->ir, NULL);
3287 printf("\n\n");
3288 } else {
3289 printf("ARB_fragment_program %d ir for native fragment shader\n",
3290 fp->Base.Id);
3291 _mesa_print_program(&fp->Base);
3292 }
3293 }
3294
3295 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3296 */
3297 fs_visitor v(brw, c, prog, fp, 8);
3298 if (!v.run()) {
3299 if (prog) {
3300 prog->LinkStatus = false;
3301 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3302 }
3303
3304 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3305 v.fail_msg);
3306
3307 return NULL;
3308 }
3309
3310 exec_list *simd16_instructions = NULL;
3311 fs_visitor v2(brw, c, prog, fp, 16);
3312 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3313 if (c->prog_data.nr_pull_params == 0) {
3314 /* Try a 16-wide compile */
3315 v2.import_uniforms(&v);
3316 if (!v2.run()) {
3317 perf_debug("16-wide shader failed to compile, falling back to "
3318 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3319 } else {
3320 simd16_instructions = &v2.instructions;
3321 }
3322 } else {
3323 perf_debug("Skipping 16-wide due to pull parameters.\n");
3324 }
3325 }
3326
3327 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3328 const unsigned *generated = g.generate_assembly(&v.instructions,
3329 simd16_instructions,
3330 final_assembly_size);
3331
3332 if (unlikely(brw->perf_debug) && shader) {
3333 if (shader->compiled_once)
3334 brw_wm_debug_recompile(brw, prog, &c->key);
3335 shader->compiled_once = true;
3336
3337 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3338 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3339 (get_time() - start_time) * 1000);
3340 }
3341 }
3342
3343 return generated;
3344 }
3345
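/**
 * Precompiles the fragment shader at link time using a guessed program key,
 * so that a likely variant is already built before first use.
 */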
3346 bool
3347 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3348 {
3349 struct brw_context *brw = brw_context(ctx);
3350 struct brw_wm_prog_key key;
3351
3352 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3353 return true;
3354
3355 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3356 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3357 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3358 bool program_uses_dfdy = fp->UsesDFdy;
3359
3360 memset(&key, 0, sizeof(key));
3361
3362 if (brw->gen < 6) {
3363 if (fp->UsesKill)
3364 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3365
3366 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3367 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3368
3369 /* Just assume depth testing. */
3370 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3371 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3372 }
3373
3374 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3375 BRW_FS_VARYING_INPUT_MASK) > 16)
3376 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3377
3378 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3379
3380 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3381 for (unsigned i = 0; i < sampler_count; i++) {
3382 if (fp->Base.ShadowSamplers & (1 << i)) {
3383 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3384 key.tex.swizzles[i] =
3385 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3386 } else {
3387 /* Color sampler: assume no swizzling. */
3388 key.tex.swizzles[i] = SWIZZLE_XYZW;
3389 }
3390 }
3391
3392 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3393 key.drawable_height = ctx->DrawBuffer->Height;
3394 }
3395
3396 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3397 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3398 }
3399
3400 key.nr_color_regions = 1;
3401
3402 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3403 * quality of the derivatives is likely to be determined by the driconf
3404 * option.
3405 */
3406 key.high_quality_derivatives = brw->disable_derivative_optimization;
3407
3408 key.program_string_id = bfp->id;
3409
3410 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3411 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3412
3413 bool success = do_wm_prog(brw, prog, bfp, &key);
3414
3415 brw->wm.base.prog_offset = old_prog_offset;
3416 brw->wm.prog_data = old_prog_data;
3417
3418 return success;
3419 }