src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/hash_table.h"
  36 #include "main/macros.h"
  37 #include "main/shaderobj.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "brw_dead_control_flow.h"
  50 #include "main/uniforms.h"
  51 #include "brw_fs_live_variables.h"
  52 #include "glsl/glsl_types.h"
  53
  54 void
  55 fs_inst::init()
  56 {
  57    memset(this, 0, sizeof(*this));
  58    this->opcode = BRW_OPCODE_NOP;
  59    this->conditional_mod = BRW_CONDITIONAL_NONE;
  60
  61    this->dst = reg_undef;
  62    this->src[0] = reg_undef;
  63    this->src[1] = reg_undef;
  64    this->src[2] = reg_undef;
  65
  66    /* This will be the case for almost all instructions. */
  67    this->regs_written = 1;
  68 }
  69
  70 fs_inst::fs_inst()
  71 {
  72    init();
  73 }
  74
  75 fs_inst::fs_inst(enum opcode opcode)
  76 {
  77    init();
  78    this->opcode = opcode;
  79 }
  80
  81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  82 {
  83    init();
  84    this->opcode = opcode;
  85    this->dst = dst;
  86
  87    if (dst.file == GRF)
  88       assert(dst.reg_offset >= 0);
  89 }
  90
  91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  92 {
  93    init();
  94    this->opcode = opcode;
  95    this->dst = dst;
  96    this->src[0] = src0;
  97
  98    if (dst.file == GRF)
  99       assert(dst.reg_offset >= 0);
 100    if (src[0].file == GRF)
 101       assert(src[0].reg_offset >= 0);
 102 }
 103
 104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 105 {
 106    init();
 107    this->opcode = opcode;
 108    this->dst = dst;
 109    this->src[0] = src0;
 110    this->src[1] = src1;
 111
 112    if (dst.file == GRF)
 113       assert(dst.reg_offset >= 0);
 114    if (src[0].file == GRF)
 115       assert(src[0].reg_offset >= 0);
 116    if (src[1].file == GRF)
 117       assert(src[1].reg_offset >= 0);
 118 }
 119
 120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 121                  fs_reg src0, fs_reg src1, fs_reg src2)
 122 {
 123    init();
 124    this->opcode = opcode;
 125    this->dst = dst;
 126    this->src[0] = src0;
 127    this->src[1] = src1;
 128    this->src[2] = src2;
 129
 130    if (dst.file == GRF)
 131       assert(dst.reg_offset >= 0);
 132    if (src[0].file == GRF)
 133       assert(src[0].reg_offset >= 0);
 134    if (src[1].file == GRF)
 135       assert(src[1].reg_offset >= 0);
 136    if (src[2].file == GRF)
 137       assert(src[2].reg_offset >= 0);
 138 }
 139
 140 #define ALU1(op)                                                        \
 141    fs_inst *                                                            \
 142    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 143    {                                                                    \
 144       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 145    }
 146
 147 #define ALU2(op)                                                        \
 148    fs_inst *                                                            \
 149    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 150    {                                                                    \
 151       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 152    }
 153
 154 #define ALU3(op)                                                        \
 155    fs_inst *                                                            \
 156    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
 157    {                                                                    \
 158       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 159    }
 160
 161 ALU1(NOT)
 162 ALU1(MOV)
 163 ALU1(FRC)
 164 ALU1(RNDD)
 165 ALU1(RNDE)
 166 ALU1(RNDZ)
 167 ALU2(ADD)
 168 ALU2(MUL)
 169 ALU2(MACH)
 170 ALU2(AND)
 171 ALU2(OR)
 172 ALU2(XOR)
 173 ALU2(SHL)
 174 ALU2(SHR)
 175 ALU2(ASR)
 176 ALU3(LRP)
 177 ALU1(BFREV)
 178 ALU3(BFE)
 179 ALU2(BFI1)
 180 ALU3(BFI2)
 181 ALU1(FBH)
 182 ALU1(FBL)
 183 ALU1(CBIT)
 184 ALU3(MAD)
 185 ALU2(ADDC)
 186 ALU2(SUBB)
 187 ALU2(SEL)
 188
 189 /** Gen4 predicated IF. */
 190 fs_inst *
 191 fs_visitor::IF(uint32_t predicate)
 192 {
 193    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 194    inst->predicate = predicate;
 195    return inst;
 196 }
 197
 198 /** Gen6 IF with embedded comparison. */
 199 fs_inst *
 200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 201 {
 202    assert(brw->gen == 6);
 203    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 204                                         reg_null_d, src0, src1);
 205    inst->conditional_mod = condition;
 206    return inst;
 207 }
 208
 209 /**
 210  * CMP: Sets the low bit of the destination channels with the result
 211  * of the comparison, while the upper bits are undefined, and updates
 212  * the flag register with the packed 16 bits of the result.
 213  */
 214 fs_inst *
 215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 216 {
 217    fs_inst *inst;
 218
 219    /* Take the instruction:
 220     *
 221     * CMP null<d> src0<f> src1<f>
 222     *
 223     * Original gen4 does type conversion to the destination type before
 224     * comparison, producing garbage results for floating point comparisons.
 225     * gen5 does the comparison on the execution type (resolved source types),
 226     * so dst type doesn't matter.  gen6 does comparison and then uses the
 227     * result as if it was the dst type with no conversion, which happens to
 228     * mostly work out for float-interpreted-as-int since our comparisons are
 229     * for >0, =0, <0.
 230     */
 231    if (brw->gen == 4) {
 232       dst.type = src0.type;
 233       if (dst.file == HW_REG)
 234          dst.fixed_hw_reg.type = dst.type;
 235    }
 236
 237    resolve_ud_negate(&src0);
 238    resolve_ud_negate(&src1);
 239
 240    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 241    inst->conditional_mod = condition;
 242
 243    return inst;
 244 }
 245
 246 exec_list
 247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
 248                                        fs_reg varying_offset,
 249                                        uint32_t const_offset)
 250 {
 251    exec_list instructions;
 252    fs_inst *inst;
 253
 254    /* We have our constant surface use a pitch of 4 bytes, so our index can
 255     * be any component of a vector, and then we load 4 contiguous
 256     * components starting from that.
 257     *
 258     * We break down the const_offset to a portion added to the variable
 259     * offset and a portion done using reg_offset, which means that if you
 260     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 261     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 262     * CSE can later notice that those loads are all the same and eliminate
 263     * the redundant ones.
 264     */
 265    fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
 266    instructions.push_tail(ADD(vec4_offset,
 267                               varying_offset, const_offset & ~3));
 268
 269    int scale = 1;
 270    if (brw->gen == 4 && dispatch_width == 8) {
 271       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 272        * u, v, r) as parameters, or we can just use the SIMD16 message
 273        * consisting of (header, u).  We choose the second, at the cost of a
 274        * longer return length.
 275        */
 276       scale = 2;
 277    }
 278
 279    enum opcode op;
 280    if (brw->gen >= 7)
 281       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 282    else
 283       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 284    fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
 285    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 286    inst->regs_written = 4 * scale;
 287    instructions.push_tail(inst);
 288
 289    if (brw->gen < 7) {
 290       inst->base_mrf = 13;
 291       inst->header_present = true;
 292       if (brw->gen == 4)
 293          inst->mlen = 3;
 294       else
 295          inst->mlen = 1 + dispatch_width / 8;
 296    }
 297
 298    vec4_result.reg_offset += (const_offset & 3) * scale;
 299    instructions.push_tail(MOV(dst, vec4_result));
 300
 301    return instructions;
 302 }
 303
 304 /**
 305  * A helper for MOV generation for fixing up broken hardware SEND dependency
 306  * handling.
 307  */
 308 fs_inst *
 309 fs_visitor::DEP_RESOLVE_MOV(int grf)
 310 {
 311    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 312
 313    inst->ir = NULL;
 314    inst->annotation = "send dependency resolve";
 315
 316    /* The caller always wants uncompressed to emit the minimal extra
 317     * dependencies, and to avoid having to deal with aligning its regs to 2.
 318     */
 319    inst->force_uncompressed = true;
 320
 321    return inst;
 322 }
 323
 324 bool
 325 fs_inst::equals(fs_inst *inst)
 326 {
 327    return (opcode == inst->opcode &&
 328            dst.equals(inst->dst) &&
 329            src[0].equals(inst->src[0]) &&
 330            src[1].equals(inst->src[1]) &&
 331            src[2].equals(inst->src[2]) &&
 332            saturate == inst->saturate &&
 333            predicate == inst->predicate &&
 334            conditional_mod == inst->conditional_mod &&
 335            mlen == inst->mlen &&
 336            base_mrf == inst->base_mrf &&
 337            sampler == inst->sampler &&
 338            target == inst->target &&
 339            eot == inst->eot &&
 340            header_present == inst->header_present &&
 341            shadow_compare == inst->shadow_compare &&
 342            offset == inst->offset);
 343 }
 344
 345 bool
 346 fs_inst::overwrites_reg(const fs_reg &reg)
 347 {
 348    return (reg.file == dst.file &&
 349            reg.reg == dst.reg &&
 350            reg.reg_offset >= dst.reg_offset  &&
 351            reg.reg_offset < dst.reg_offset + regs_written);
 352 }
 353
 354 bool
 355 fs_inst::is_send_from_grf()
 356 {
 357    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 358            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
 359            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 360             src[1].file == GRF) ||
 361            (is_tex() && src[0].file == GRF));
 362 }
 363
 364 bool
 365 fs_visitor::can_do_source_mods(fs_inst *inst)
 366 {
 367    if (brw->gen == 6 && inst->is_math())
 368       return false;
 369
 370    if (inst->is_send_from_grf())
 371       return false;
 372
 373    if (!inst->can_do_source_mods())
 374       return false;
 375
 376    return true;
 377 }
 378
 379 void
 380 fs_reg::init()
 381 {
 382    memset(this, 0, sizeof(*this));
 383    this->smear = -1;
 384 }
 385
 386 /** Generic unset register constructor. */
 387 fs_reg::fs_reg()
 388 {
 389    init();
 390    this->file = BAD_FILE;
 391 }
 392
 393 /** Immediate value constructor. */
 394 fs_reg::fs_reg(float f)
 395 {
 396    init();
 397    this->file = IMM;
 398    this->type = BRW_REGISTER_TYPE_F;
 399    this->imm.f = f;
 400 }
 401
 402 /** Immediate value constructor. */
 403 fs_reg::fs_reg(int32_t i)
 404 {
 405    init();
 406    this->file = IMM;
 407    this->type = BRW_REGISTER_TYPE_D;
 408    this->imm.i = i;
 409 }
 410
 411 /** Immediate value constructor. */
 412 fs_reg::fs_reg(uint32_t u)
 413 {
 414    init();
 415    this->file = IMM;
 416    this->type = BRW_REGISTER_TYPE_UD;
 417    this->imm.u = u;
 418 }
 419
 420 /** Fixed brw_reg. */
 421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 422 {
 423    init();
 424    this->file = HW_REG;
 425    this->fixed_hw_reg = fixed_hw_reg;
 426    this->type = fixed_hw_reg.type;
 427 }
 428
 429 bool
 430 fs_reg::equals(const fs_reg &r) const
 431 {
 432    return (file == r.file &&
 433            reg == r.reg &&
 434            reg_offset == r.reg_offset &&
 435            type == r.type &&
 436            negate == r.negate &&
 437            abs == r.abs &&
 438            !reladdr && !r.reladdr &&
 439            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 440                   sizeof(fixed_hw_reg)) == 0 &&
 441            smear == r.smear &&
 442            imm.u == r.imm.u);
 443 }
 444
 445 fs_reg
 446 fs_reg::retype(uint32_t type)
 447 {
 448    fs_reg result = *this;
 449    result.type = type;
 450    return result;
 451 }
 452
 453 bool
 454 fs_reg::is_zero() const
 455 {
 456    if (file != IMM)
 457       return false;
 458
 459    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 460 }
 461
 462 bool
 463 fs_reg::is_one() const
 464 {
 465    if (file != IMM)
 466       return false;
 467
 468    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 469 }
 470
 471 bool
 472 fs_reg::is_null() const
 473 {
 474    return file == HW_REG &&
 475           fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 476           fixed_hw_reg.nr == BRW_ARF_NULL;
 477 }
 478
 479 bool
 480 fs_reg::is_valid_3src() const
 481 {
 482    return file == GRF || file == UNIFORM;
 483 }
 484
 485 int
 486 fs_visitor::type_size(const struct glsl_type *type)
 487 {
 488    unsigned int size, i;
 489
 490    switch (type->base_type) {
 491    case GLSL_TYPE_UINT:
 492    case GLSL_TYPE_INT:
 493    case GLSL_TYPE_FLOAT:
 494    case GLSL_TYPE_BOOL:
 495       return type->components();
 496    case GLSL_TYPE_ARRAY:
 497       return type_size(type->fields.array) * type->length;
 498    case GLSL_TYPE_STRUCT:
 499       size = 0;
 500       for (i = 0; i < type->length; i++) {
 501          size += type_size(type->fields.structure[i].type);
 502       }
 503       return size;
 504    case GLSL_TYPE_SAMPLER:
 505       /* Samplers take up no register space, since they're baked in at
 506        * link time.
 507        */
 508       return 0;
 509    case GLSL_TYPE_ATOMIC_UINT:
 510       return 0;
 511    case GLSL_TYPE_VOID:
 512    case GLSL_TYPE_ERROR:
 513    case GLSL_TYPE_INTERFACE:
 514       assert(!"not reached");
 515       break;
 516    }
 517
 518    return 0;
 519 }
 520
 521 fs_reg
 522 fs_visitor::get_timestamp()
 523 {
 524    assert(brw->gen >= 7);
 525
 526    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 527                                           BRW_ARF_TIMESTAMP,
 528                                           0),
 529                              BRW_REGISTER_TYPE_UD));
 530
 531    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 532
 533    fs_inst *mov = emit(MOV(dst, ts));
 534    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 535     * even if it's not enabled in the dispatch.
 536     */
 537    mov->force_writemask_all = true;
 538    mov->force_uncompressed = true;
 539
 540    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 541     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 542     * which is plenty of time for our purposes.  It is identical across the
 543     * EUs, but since it's tracking GPU core speed it will increment at a
 544     * varying rate as render P-states change.
 545     *
 546     * The caller could also check if render P-states have changed (or anything
 547     * else that might disrupt timing) by setting smear to 2 and checking if
 548     * that field is != 0.
 549     */
 550    dst.smear = 0;
 551
 552    return dst;
 553 }
 554
 555 void
 556 fs_visitor::emit_shader_time_begin()
 557 {
 558    current_annotation = "shader time start";
 559    shader_start_time = get_timestamp();
 560 }
 561
 562 void
 563 fs_visitor::emit_shader_time_end()
 564 {
 565    current_annotation = "shader time end";
 566
 567    enum shader_time_shader_type type, written_type, reset_type;
 568    if (dispatch_width == 8) {
 569       type = ST_FS8;
 570       written_type = ST_FS8_WRITTEN;
 571       reset_type = ST_FS8_RESET;
 572    } else {
 573       assert(dispatch_width == 16);
 574       type = ST_FS16;
 575       written_type = ST_FS16_WRITTEN;
 576       reset_type = ST_FS16_RESET;
 577    }
 578
 579    fs_reg shader_end_time = get_timestamp();
 580
 581    /* Check that there weren't any timestamp reset events (assuming these
 582     * were the only two timestamp reads that happened).
 583     */
 584    fs_reg reset = shader_end_time;
 585    reset.smear = 2;
 586    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 587    test->conditional_mod = BRW_CONDITIONAL_Z;
 588    emit(IF(BRW_PREDICATE_NORMAL));
 589
 590    push_force_uncompressed();
 591    fs_reg start = shader_start_time;
 592    start.negate = true;
 593    fs_reg diff = fs_reg(this, glsl_type::uint_type);
 594    emit(ADD(diff, start, shader_end_time));
 595
 596    /* If there were no instructions between the two timestamp gets, the diff
 597     * is 2 cycles.  Remove that overhead, so I can forget about that when
 598     * trying to determine the time taken for single instructions.
 599     */
 600    emit(ADD(diff, diff, fs_reg(-2u)));
 601
 602    emit_shader_time_write(type, diff);
 603    emit_shader_time_write(written_type, fs_reg(1u));
 604    emit(BRW_OPCODE_ELSE);
 605    emit_shader_time_write(reset_type, fs_reg(1u));
 606    emit(BRW_OPCODE_ENDIF);
 607
 608    pop_force_uncompressed();
 609 }
 610
 611 void
 612 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 613                                    fs_reg value)
 614 {
 615    int shader_time_index =
 616       brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
 617    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 618
 619    fs_reg payload;
 620    if (dispatch_width == 8)
 621       payload = fs_reg(this, glsl_type::uvec2_type);
 622    else
 623       payload = fs_reg(this, glsl_type::uint_type);
 624
 625    emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 626                 fs_reg(), payload, offset, value));
 627 }
 628
 629 void
 630 fs_visitor::fail(const char *format, ...)
 631 {
 632    va_list va;
 633    char *msg;
 634
 635    if (failed)
 636       return;
 637
 638    failed = true;
 639
 640    va_start(va, format);
 641    msg = ralloc_vasprintf(mem_ctx, format, va);
 642    va_end(va);
 643    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 644
 645    this->fail_msg = msg;
 646
 647    if (INTEL_DEBUG & DEBUG_WM) {
 648       fprintf(stderr, "%s",  msg);
 649    }
 650 }
 651
 652 fs_inst *
 653 fs_visitor::emit(enum opcode opcode)
 654 {
 655    return emit(fs_inst(opcode));
 656 }
 657
 658 fs_inst *
 659 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 660 {
 661    return emit(fs_inst(opcode, dst));
 662 }
 663
 664 fs_inst *
 665 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 666 {
 667    return emit(fs_inst(opcode, dst, src0));
 668 }
 669
 670 fs_inst *
 671 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 672 {
 673    return emit(fs_inst(opcode, dst, src0, src1));
 674 }
 675
 676 fs_inst *
 677 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 678                  fs_reg src0, fs_reg src1, fs_reg src2)
 679 {
 680    return emit(fs_inst(opcode, dst, src0, src1, src2));
 681 }
 682
 683 void
 684 fs_visitor::push_force_uncompressed()
 685 {
 686    force_uncompressed_stack++;
 687 }
 688
 689 void
 690 fs_visitor::pop_force_uncompressed()
 691 {
 692    force_uncompressed_stack--;
 693    assert(force_uncompressed_stack >= 0);
 694 }
 695
 696 /**
 697  * Returns true if the instruction has a flag that means it won't
 698  * update an entire destination register.
 699  *
 700  * For example, dead code elimination and live variable analysis want to know
 701  * when a write to a variable screens off any preceding values that were in
 702  * it.
 703  */
 704 bool
 705 fs_inst::is_partial_write()
 706 {
 707    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 708            this->force_uncompressed ||
 709            this->force_sechalf);
 710 }
 711
 712 int
 713 fs_inst::regs_read(fs_visitor *v, int arg)
 714 {
 715    if (is_tex() && arg == 0 && src[0].file == GRF) {
 716       if (v->dispatch_width == 16)
 717          return (mlen + 1) / 2;
 718       else
 719          return mlen;
 720    }
 721    return 1;
 722 }
 723
 724 bool
 725 fs_inst::reads_flag()
 726 {
 727    return predicate;
 728 }
 729
 730 bool
 731 fs_inst::writes_flag()
 732 {
 733    return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
 734           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 735 }
 736
 737 /**
 738  * Returns how many MRFs an FS opcode will write over.
 739  *
 740  * Note that this is not the 0 or 1 implied writes in an actual gen
 741  * instruction -- the FS opcodes often generate MOVs in addition.
 742  */
 743 int
 744 fs_visitor::implied_mrf_writes(fs_inst *inst)
 745 {
 746    if (inst->mlen == 0)
 747       return 0;
 748
 749    if (inst->base_mrf == -1)
 750       return 0;
 751
 752    switch (inst->opcode) {
 753    case SHADER_OPCODE_RCP:
 754    case SHADER_OPCODE_RSQ:
 755    case SHADER_OPCODE_SQRT:
 756    case SHADER_OPCODE_EXP2:
 757    case SHADER_OPCODE_LOG2:
 758    case SHADER_OPCODE_SIN:
 759    case SHADER_OPCODE_COS:
 760       return 1 * dispatch_width / 8;
 761    case SHADER_OPCODE_POW:
 762    case SHADER_OPCODE_INT_QUOTIENT:
 763    case SHADER_OPCODE_INT_REMAINDER:
 764       return 2 * dispatch_width / 8;
 765    case SHADER_OPCODE_TEX:
 766    case FS_OPCODE_TXB:
 767    case SHADER_OPCODE_TXD:
 768    case SHADER_OPCODE_TXF:
 769    case SHADER_OPCODE_TXF_MS:
 770    case SHADER_OPCODE_TXF_MCS:
 771    case SHADER_OPCODE_TG4:
 772    case SHADER_OPCODE_TG4_OFFSET:
 773    case SHADER_OPCODE_TXL:
 774    case SHADER_OPCODE_TXS:
 775    case SHADER_OPCODE_LOD:
 776       return 1;
 777    case FS_OPCODE_FB_WRITE:
 778       return 2;
 779    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 780    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 781       return 1;
 782    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 783       return inst->mlen;
 784    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 785       return 2;
 786    case SHADER_OPCODE_UNTYPED_ATOMIC:
 787    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 788       return 0;
 789    default:
 790       assert(!"not reached");
 791       return inst->mlen;
 792    }
 793 }
 794
 795 int
 796 fs_visitor::virtual_grf_alloc(int size)
 797 {
 798    if (virtual_grf_array_size <= virtual_grf_count) {
 799       if (virtual_grf_array_size == 0)
 800          virtual_grf_array_size = 16;
 801       else
 802          virtual_grf_array_size *= 2;
 803       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 804                                    virtual_grf_array_size);
 805    }
 806    virtual_grf_sizes[virtual_grf_count] = size;
 807    return virtual_grf_count++;
 808 }
 809
 810 /** Fixed HW reg constructor. */
 811 fs_reg::fs_reg(enum register_file file, int reg)
 812 {
 813    init();
 814    this->file = file;
 815    this->reg = reg;
 816    this->type = BRW_REGISTER_TYPE_F;
 817 }
 818
 819 /** Fixed HW reg constructor. */
 820 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 821 {
 822    init();
 823    this->file = file;
 824    this->reg = reg;
 825    this->type = type;
 826 }
 827
 828 /** Automatic reg constructor. */
 829 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 830 {
 831    init();
 832
 833    this->file = GRF;
 834    this->reg = v->virtual_grf_alloc(v->type_size(type));
 835    this->reg_offset = 0;
 836    this->type = brw_type_for_base_type(type);
 837 }
 838
 839 fs_reg *
 840 fs_visitor::variable_storage(ir_variable *var)
 841 {
 842    return (fs_reg *)hash_table_find(this->variable_ht, var);
 843 }
 844
 845 void
 846 import_uniforms_callback(const void *key,
 847                          void *data,
 848                          void *closure)
 849 {
 850    struct hash_table *dst_ht = (struct hash_table *)closure;
 851    const fs_reg *reg = (const fs_reg *)data;
 852
 853    if (reg->file != UNIFORM)
 854       return;
 855
 856    hash_table_insert(dst_ht, data, key);
 857 }
 858
 859 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 860  * This brings in those uniform definitions
 861  */
 862 void
 863 fs_visitor::import_uniforms(fs_visitor *v)
 864 {
 865    hash_table_call_foreach(v->variable_ht,
 866                            import_uniforms_callback,
 867                            variable_ht);
 868    this->params_remap = v->params_remap;
 869    this->nr_params_remap = v->nr_params_remap;
 870 }
 871
 872 /* Our support for uniforms is piggy-backed on the struct
 873  * gl_fragment_program, because that's where the values actually
 874  * get stored, rather than in some global gl_shader_program uniform
 875  * store.
 876  */
 877 void
 878 fs_visitor::setup_uniform_values(ir_variable *ir)
 879 {
 880    int namelen = strlen(ir->name);
 881
 882    /* The data for our (non-builtin) uniforms is stored in a series of
 883     * gl_uniform_driver_storage structs for each subcomponent that
 884     * glGetUniformLocation() could name.  We know it's been set up in the same
 885     * order we'd walk the type, so walk the list of storage and find anything
 886     * with our name, or the prefix of a component that starts with our name.
 887     */
 888    unsigned params_before = c->prog_data.nr_params;
 889    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 890       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 891
 892       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 893           (storage->name[namelen] != 0 &&
 894            storage->name[namelen] != '.' &&
 895            storage->name[namelen] != '[')) {
 896          continue;
 897       }
 898
 899       unsigned slots = storage->type->component_slots();
 900       if (storage->array_elements)
 901          slots *= storage->array_elements;
 902
 903       for (unsigned i = 0; i < slots; i++) {
 904          c->prog_data.param[c->prog_data.nr_params++] =
 905             &storage->storage[i].f;
 906       }
 907    }
 908
 909    /* Make sure we actually initialized the right amount of stuff here. */
 910    assert(params_before + ir->type->component_slots() ==
 911           c->prog_data.nr_params);
 912    (void)params_before;
 913 }
 914
 915
 916 /* Our support for builtin uniforms is even scarier than non-builtin.
 917  * It sits on top of the PROG_STATE_VAR parameters that are
 918  * automatically updated from GL context state.
 919  */
 920 void
 921 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 922 {
 923    const ir_state_slot *const slots = ir->state_slots;
 924    assert(ir->state_slots != NULL);
 925
 926    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 927       /* This state reference has already been setup by ir_to_mesa, but we'll
 928        * get the same index back here.
 929        */
 930       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 931                                             (gl_state_index *)slots[i].tokens);
 932
 933       /* Add each of the unique swizzles of the element as a parameter.
 934        * This'll end up matching the expected layout of the
 935        * array/matrix/structure we're trying to fill in.
 936        */
 937       int last_swiz = -1;
 938       for (unsigned int j = 0; j < 4; j++) {
 939          int swiz = GET_SWZ(slots[i].swizzle, j);
 940          if (swiz == last_swiz)
 941             break;
 942          last_swiz = swiz;
 943
 944          c->prog_data.param[c->prog_data.nr_params++] =
 945             &fp->Base.Parameters->ParameterValues[index][swiz].f;
 946       }
 947    }
 948 }
 949
 950 fs_reg *
 951 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 952 {
 953    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 954    fs_reg wpos = *reg;
 955    bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
 956
 957    /* gl_FragCoord.x */
 958    if (ir->data.pixel_center_integer) {
 959       emit(MOV(wpos, this->pixel_x));
 960    } else {
 961       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
 962    }
 963    wpos.reg_offset++;
 964
 965    /* gl_FragCoord.y */
 966    if (!flip && ir->data.pixel_center_integer) {
 967       emit(MOV(wpos, this->pixel_y));
 968    } else {
 969       fs_reg pixel_y = this->pixel_y;
 970       float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
 971
 972       if (flip) {
 973          pixel_y.negate = true;
 974          offset += c->key.drawable_height - 1.0;
 975       }
 976
 977       emit(ADD(wpos, pixel_y, fs_reg(offset)));
 978    }
 979    wpos.reg_offset++;
 980
 981    /* gl_FragCoord.z */
 982    if (brw->gen >= 6) {
 983       emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
 984    } else {
 985       emit(FS_OPCODE_LINTERP, wpos,
 986            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 987            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 988            interp_reg(VARYING_SLOT_POS, 2));
 989    }
 990    wpos.reg_offset++;
 991
 992    /* gl_FragCoord.w: Already set up in emit_interpolation */
 993    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
 994
 995    return reg;
 996 }
 997
 998 fs_inst *
 999 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1000                          glsl_interp_qualifier interpolation_mode,
1001                          bool is_centroid)
1002 {
1003    brw_wm_barycentric_interp_mode barycoord_mode;
1004    if (brw->gen >= 6) {
1005       if (is_centroid) {
1006          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1007             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1008          else
1009             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1010       } else {
1011          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1012             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1013          else
1014             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1015       }
1016    } else {
1017       /* On Ironlake and below, there is only one interpolation mode.
1018        * Centroid interpolation doesn't mean anything on this hardware --
1019        * there is no multisampling.
1020        */
1021       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1022    }
1023    return emit(FS_OPCODE_LINTERP, attr,
1024                this->delta_x[barycoord_mode],
1025                this->delta_y[barycoord_mode], interp);
1026 }
1027
1028 fs_reg *
1029 fs_visitor::emit_general_interpolation(ir_variable *ir)
1030 {
1031    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1032    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1033    fs_reg attr = *reg;
1034
1035    unsigned int array_elements;
1036    const glsl_type *type;
1037
1038    if (ir->type->is_array()) {
1039       array_elements = ir->type->length;
1040       if (array_elements == 0) {
1041          fail("dereferenced array '%s' has length 0\n", ir->name);
1042       }
1043       type = ir->type->fields.array;
1044    } else {
1045       array_elements = 1;
1046       type = ir->type;
1047    }
1048
1049    glsl_interp_qualifier interpolation_mode =
1050       ir->determine_interpolation_mode(c->key.flat_shade);
1051
1052    int location = ir->data.location;
1053    for (unsigned int i = 0; i < array_elements; i++) {
1054       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1055          if (c->prog_data.urb_setup[location] == -1) {
1056             /* If there's no incoming setup data for this slot, don't
1057              * emit interpolation for it.
1058              */
1059             attr.reg_offset += type->vector_elements;
1060             location++;
1061             continue;
1062          }
1063
1064          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1065             /* Constant interpolation (flat shading) case. The SF has
1066              * handed us defined values in only the constant offset
1067              * field of the setup reg.
1068              */
1069             for (unsigned int k = 0; k < type->vector_elements; k++) {
1070                struct brw_reg interp = interp_reg(location, k);
1071                interp = suboffset(interp, 3);
1072                interp.type = reg->type;
1073                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1074                attr.reg_offset++;
1075             }
1076          } else {
1077             /* Smooth/noperspective interpolation case. */
1078             for (unsigned int k = 0; k < type->vector_elements; k++) {
1079                /* FINISHME: At some point we probably want to push
1080                 * this farther by giving similar treatment to the
1081                 * other potentially constant components of the
1082                 * attribute, as well as making brw_vs_constval.c
1083                 * handle varyings other than gl_TexCoord.
1084                 */
1085                struct brw_reg interp = interp_reg(location, k);
1086                emit_linterp(attr, fs_reg(interp), interpolation_mode,
1087                             ir->data.centroid);
1088                if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1089                   /* Get the pixel/sample mask into f0 so that we know
1090                    * which pixels are lit.  Then, for each channel that is
1091                    * unlit, replace the centroid data with non-centroid
1092                    * data.
1093                    */
1094                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1095                   fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1096                                                interpolation_mode, false);
1097                   inst->predicate = BRW_PREDICATE_NORMAL;
1098                   inst->predicate_inverse = true;
1099                }
1100                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1101                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1102                }
1103                attr.reg_offset++;
1104             }
1105
1106          }
1107          location++;
1108       }
1109    }
1110
1111    return reg;
1112 }
1113
1114 fs_reg *
1115 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1116 {
1117    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1118
1119    /* The frontfacing comes in as a bit in the thread payload. */
1120    if (brw->gen >= 6) {
1121       emit(BRW_OPCODE_ASR, *reg,
1122            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1123            fs_reg(15));
1124       emit(BRW_OPCODE_NOT, *reg, *reg);
1125       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1126    } else {
1127       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1128       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1129        * us front face
1130        */
1131       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1132       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1133    }
1134
1135    return reg;
1136 }
1137
1138 void
1139 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1140 {
1141    assert(dst.type == BRW_REGISTER_TYPE_F);
1142
1143    if (c->key.compute_pos_offset) {
1144       /* Convert int_sample_pos to floating point */
1145       emit(MOV(dst, int_sample_pos));
1146       /* Scale to the range [0, 1] */
1147       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1148    }
1149    else {
1150       /* From ARB_sample_shading specification:
1151        * "When rendering to a non-multisample buffer, or if multisample
1152        *  rasterization is disabled, gl_SamplePosition will always be
1153        *  (0.5, 0.5).
1154        */
1155       emit(MOV(dst, fs_reg(0.5f)));
1156    }
1157 }
1158
1159 fs_reg *
1160 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1161 {
1162    assert(brw->gen >= 6);
1163    assert(ir->type == glsl_type::vec2_type);
1164
1165    this->current_annotation = "compute sample position";
1166    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1167    fs_reg pos = *reg;
1168    fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1169    fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1170
1171    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1172     * mode will be enabled.
1173     *
1174     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1175     * R31.1:0         Position Offset X/Y for Slot[3:0]
1176     * R31.3:2         Position Offset X/Y for Slot[7:4]
1177     * .....
1178     *
1179     * The X, Y sample positions come in as bytes in  thread payload. So, read
1180     * the positions using vstride=16, width=8, hstride=2.
1181     */
1182    struct brw_reg sample_pos_reg =
1183       stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1184                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1185
1186    emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1187    if (dispatch_width == 16) {
1188       int_sample_x.sechalf = true;
1189       fs_inst *inst = emit(MOV(int_sample_x,
1190                                fs_reg(suboffset(sample_pos_reg, 16))));
1191       inst->force_sechalf = true;
1192       int_sample_x.sechalf = false;
1193    }
1194    /* Compute gl_SamplePosition.x */
1195    compute_sample_position(pos, int_sample_x);
1196    pos.reg_offset++;
1197    emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1198    if (dispatch_width == 16) {
1199       int_sample_y.sechalf = true;
1200       fs_inst *inst = emit(MOV(int_sample_y,
1201                                fs_reg(suboffset(sample_pos_reg, 17))));
1202       inst->force_sechalf = true;
1203       int_sample_y.sechalf = false;
1204    }
1205    /* Compute gl_SamplePosition.y */
1206    compute_sample_position(pos, int_sample_y);
1207    return reg;
1208 }
1209
1210 fs_reg *
1211 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1212 {
1213    assert(brw->gen >= 6);
1214
1215    this->current_annotation = "compute sample id";
1216    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1217
1218    if (c->key.compute_sample_id) {
1219       fs_reg t1 = fs_reg(this, glsl_type::int_type);
1220       fs_reg t2 = fs_reg(this, glsl_type::int_type);
1221       t2.type = BRW_REGISTER_TYPE_UW;
1222
1223       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1224        * 8x multisampling, subspan 0 will represent sample N (where N
1225        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1226        * 7. We can find the value of N by looking at R0.0 bits 7:6
1227        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1228        * (since samples are always delivered in pairs). That is, we
1229        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1230        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1231        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1232        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1233        * populating a temporary variable with the sequence (0, 1, 2, 3),
1234        * and then reading from it using vstride=1, width=4, hstride=0.
1235        * These computations hold good for 4x multisampling as well.
1236        */
1237       emit(BRW_OPCODE_AND, t1,
1238            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1239            fs_reg(brw_imm_d(0xc0)));
1240       emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1241       /* This works for both SIMD8 and SIMD16 */
1242       emit(MOV(t2, brw_imm_v(0x3210)));
1243       /* This special instruction takes care of setting vstride=1,
1244        * width=4, hstride=0 of t2 during an ADD instruction.
1245        */
1246       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1247    } else {
1248       /* As per GL_ARB_sample_shading specification:
1249        * "When rendering to a non-multisample buffer, or if multisample
1250        *  rasterization is disabled, gl_SampleID will always be zero."
1251        */
1252       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1253    }
1254
1255    return reg;
1256 }
1257
1258 fs_reg
1259 fs_visitor::fix_math_operand(fs_reg src)
1260 {
1261    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1262     * might be able to do better by doing execsize = 1 math and then
1263     * expanding that result out, but we would need to be careful with
1264     * masking.
1265     *
1266     * The hardware ignores source modifiers (negate and abs) on math
1267     * instructions, so we also move to a temp to set those up.
1268     */
1269    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1270        !src.abs && !src.negate)
1271       return src;
1272
1273    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1274     * operands to math
1275     */
1276    if (brw->gen >= 7 && src.file != IMM)
1277       return src;
1278
1279    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1280    expanded.type = src.type;
1281    emit(BRW_OPCODE_MOV, expanded, src);
1282    return expanded;
1283 }
1284
1285 fs_inst *
1286 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1287 {
1288    switch (opcode) {
1289    case SHADER_OPCODE_RCP:
1290    case SHADER_OPCODE_RSQ:
1291    case SHADER_OPCODE_SQRT:
1292    case SHADER_OPCODE_EXP2:
1293    case SHADER_OPCODE_LOG2:
1294    case SHADER_OPCODE_SIN:
1295    case SHADER_OPCODE_COS:
1296       break;
1297    default:
1298       assert(!"not reached: bad math opcode");
1299       return NULL;
1300    }
1301
1302    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1303     * might be able to do better by doing execsize = 1 math and then
1304     * expanding that result out, but we would need to be careful with
1305     * masking.
1306     *
1307     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1308     * instructions, so we also move to a temp to set those up.
1309     */
1310    if (brw->gen >= 6)
1311       src = fix_math_operand(src);
1312
1313    fs_inst *inst = emit(opcode, dst, src);
1314
1315    if (brw->gen < 6) {
1316       inst->base_mrf = 2;
1317       inst->mlen = dispatch_width / 8;
1318    }
1319
1320    return inst;
1321 }
1322
1323 fs_inst *
1324 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1325 {
1326    int base_mrf = 2;
1327    fs_inst *inst;
1328
1329    switch (opcode) {
1330    case SHADER_OPCODE_INT_QUOTIENT:
1331    case SHADER_OPCODE_INT_REMAINDER:
1332       if (brw->gen >= 7 && dispatch_width == 16)
1333          fail("16-wide INTDIV unsupported\n");
1334       break;
1335    case SHADER_OPCODE_POW:
1336       break;
1337    default:
1338       assert(!"not reached: unsupported binary math opcode.");
1339       return NULL;
1340    }
1341
1342    if (brw->gen >= 6) {
1343       src0 = fix_math_operand(src0);
1344       src1 = fix_math_operand(src1);
1345
1346       inst = emit(opcode, dst, src0, src1);
1347    } else {
1348       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1349        * "Message Payload":
1350        *
1351        * "Operand0[7].  For the INT DIV functions, this operand is the
1352        *  denominator."
1353        *  ...
1354        * "Operand1[7].  For the INT DIV functions, this operand is the
1355        *  numerator."
1356        */
1357       bool is_int_div = opcode != SHADER_OPCODE_POW;
1358       fs_reg &op0 = is_int_div ? src1 : src0;
1359       fs_reg &op1 = is_int_div ? src0 : src1;
1360
1361       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1362       inst = emit(opcode, dst, op0, reg_null_f);
1363
1364       inst->base_mrf = base_mrf;
1365       inst->mlen = 2 * dispatch_width / 8;
1366    }
1367    return inst;
1368 }
1369
1370 void
1371 fs_visitor::assign_curb_setup()
1372 {
1373    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1374    if (dispatch_width == 8) {
1375       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1376    } else {
1377       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1378    }
1379
1380    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1381    foreach_list(node, &this->instructions) {
1382       fs_inst *inst = (fs_inst *)node;
1383
1384       for (unsigned int i = 0; i < 3; i++) {
1385          if (inst->src[i].file == UNIFORM) {
1386             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1387             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1388                                                   constant_nr / 8,
1389                                                   constant_nr % 8);
1390
1391             inst->src[i].file = HW_REG;
1392             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1393          }
1394       }
1395    }
1396 }
1397
1398 void
1399 fs_visitor::calculate_urb_setup()
1400 {
1401    for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1402       c->prog_data.urb_setup[i] = -1;
1403    }
1404
1405    int urb_next = 0;
1406    /* Figure out where each of the incoming setup attributes lands. */
1407    if (brw->gen >= 6) {
1408       if (_mesa_bitcount_64(fp->Base.InputsRead &
1409                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1410          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1411           * first 16 varying inputs, so we can put them wherever we want.
1412           * Just put them in order.
1413           *
1414           * This is useful because it means that (a) inputs not used by the
1415           * fragment shader won't take up valuable register space, and (b) we
1416           * won't have to recompile the fragment shader if it gets paired with
1417           * a different vertex (or geometry) shader.
1418           */
1419          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1420             if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1421                 BITFIELD64_BIT(i)) {
1422                c->prog_data.urb_setup[i] = urb_next++;
1423             }
1424          }
1425       } else {
1426          /* We have enough input varyings that the SF/SBE pipeline stage can't
1427           * arbitrarily rearrange them to suit our whim; we have to put them
1428           * in an order that matches the output of the previous pipeline stage
1429           * (geometry or vertex shader).
1430           */
1431          struct brw_vue_map prev_stage_vue_map;
1432          brw_compute_vue_map(brw, &prev_stage_vue_map,
1433                              c->key.input_slots_valid);
1434          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1435          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1436          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1437               slot++) {
1438             int varying = prev_stage_vue_map.slot_to_varying[slot];
1439             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1440              * unused.
1441              */
1442             if (varying != BRW_VARYING_SLOT_COUNT &&
1443                 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1444                  BITFIELD64_BIT(varying))) {
1445                c->prog_data.urb_setup[varying] = slot - first_slot;
1446             }
1447          }
1448          urb_next = prev_stage_vue_map.num_slots - first_slot;
1449       }
1450    } else {
1451       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1452       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1453          /* Point size is packed into the header, not as a general attribute */
1454          if (i == VARYING_SLOT_PSIZ)
1455             continue;
1456
1457          if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1458             /* The back color slot is skipped when the front color is
1459              * also written to.  In addition, some slots can be
1460              * written in the vertex shader and not read in the
1461              * fragment shader.  So the register number must always be
1462              * incremented, mapped or not.
1463              */
1464             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1465                c->prog_data.urb_setup[i] = urb_next;
1466             urb_next++;
1467          }
1468       }
1469
1470       /*
1471        * It's a FS only attribute, and we did interpolation for this attribute
1472        * in SF thread. So, count it here, too.
1473        *
1474        * See compile_sf_prog() for more info.
1475        */
1476       if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1477          c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1478    }
1479
1480    c->prog_data.num_varying_inputs = urb_next;
1481 }
1482
1483 void
1484 fs_visitor::assign_urb_setup()
1485 {
1486    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1487
1488    /* Offset all the urb_setup[] index by the actual position of the
1489     * setup regs, now that the location of the constants has been chosen.
1490     */
1491    foreach_list(node, &this->instructions) {
1492       fs_inst *inst = (fs_inst *)node;
1493
1494       if (inst->opcode == FS_OPCODE_LINTERP) {
1495          assert(inst->src[2].file == HW_REG);
1496          inst->src[2].fixed_hw_reg.nr += urb_start;
1497       }
1498
1499       if (inst->opcode == FS_OPCODE_CINTERP) {
1500          assert(inst->src[0].file == HW_REG);
1501          inst->src[0].fixed_hw_reg.nr += urb_start;
1502       }
1503    }
1504
1505    /* Each attribute is 4 setup channels, each of which is half a reg. */
1506    this->first_non_payload_grf =
1507       urb_start + c->prog_data.num_varying_inputs * 2;
1508 }
1509
1510 /**
1511  * Split large virtual GRFs into separate components if we can.
1512  *
1513  * This is mostly duplicated with what brw_fs_vector_splitting does,
1514  * but that's really conservative because it's afraid of doing
1515  * splitting that doesn't result in real progress after the rest of
1516  * the optimization phases, which would cause infinite looping in
1517  * optimization.  We can do it once here, safely.  This also has the
1518  * opportunity to split interpolated values, or maybe even uniforms,
1519  * which we don't have at the IR level.
1520  *
1521  * We want to split, because virtual GRFs are what we register
1522  * allocate and spill (due to contiguousness requirements for some
1523  * instructions), and they're what we naturally generate in the
1524  * codegen process, but most virtual GRFs don't actually need to be
1525  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1526  * live intervals and better dead code elimination and coalescing.
1527  */
1528 void
1529 fs_visitor::split_virtual_grfs()
1530 {
1531    int num_vars = this->virtual_grf_count;
1532    bool split_grf[num_vars];
1533    int new_virtual_grf[num_vars];
1534
1535    /* Try to split anything > 0 sized. */
1536    for (int i = 0; i < num_vars; i++) {
1537       if (this->virtual_grf_sizes[i] != 1)
1538          split_grf[i] = true;
1539       else
1540          split_grf[i] = false;
1541    }
1542
1543    if (brw->has_pln &&
1544        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1545       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1546        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1547        * Gen6, that was the only supported interpolation mode, and since Gen6,
1548        * delta_x and delta_y are in fixed hardware registers.
1549        */
1550       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1551          false;
1552    }
1553
1554    foreach_list(node, &this->instructions) {
1555       fs_inst *inst = (fs_inst *)node;
1556
1557       /* If there's a SEND message that requires contiguous destination
1558        * registers, no splitting is allowed.
1559        */
1560       if (inst->regs_written > 1) {
1561          split_grf[inst->dst.reg] = false;
1562       }
1563
1564       /* If we're sending from a GRF, don't split it, on the assumption that
1565        * the send is reading the whole thing.
1566        */
1567       if (inst->is_send_from_grf()) {
1568          for (int i = 0; i < 3; i++) {
1569             if (inst->src[i].file == GRF) {
1570                split_grf[inst->src[i].reg] = false;
1571             }
1572          }
1573       }
1574    }
1575
1576    /* Allocate new space for split regs.  Note that the virtual
1577     * numbers will be contiguous.
1578     */
1579    for (int i = 0; i < num_vars; i++) {
1580       if (split_grf[i]) {
1581          new_virtual_grf[i] = virtual_grf_alloc(1);
1582          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1583             int reg = virtual_grf_alloc(1);
1584             assert(reg == new_virtual_grf[i] + j - 1);
1585             (void) reg;
1586          }
1587          this->virtual_grf_sizes[i] = 1;
1588       }
1589    }
1590
1591    foreach_list(node, &this->instructions) {
1592       fs_inst *inst = (fs_inst *)node;
1593
1594       if (inst->dst.file == GRF &&
1595           split_grf[inst->dst.reg] &&
1596           inst->dst.reg_offset != 0) {
1597          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1598                           inst->dst.reg_offset - 1);
1599          inst->dst.reg_offset = 0;
1600       }
1601       for (int i = 0; i < 3; i++) {
1602          if (inst->src[i].file == GRF &&
1603              split_grf[inst->src[i].reg] &&
1604              inst->src[i].reg_offset != 0) {
1605             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1606                                 inst->src[i].reg_offset - 1);
1607             inst->src[i].reg_offset = 0;
1608          }
1609       }
1610    }
1611    invalidate_live_intervals();
1612 }
1613
1614 /**
1615  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1616  *
1617  * During code generation, we create tons of temporary variables, many of
1618  * which get immediately killed and are never used again.  Yet, in later
1619  * optimization and analysis passes, such as compute_live_intervals, we need
1620  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1621  * overhead.
1622  */
1623 void
1624 fs_visitor::compact_virtual_grfs()
1625 {
1626    /* Mark which virtual GRFs are used, and count how many. */
1627    int remap_table[this->virtual_grf_count];
1628    memset(remap_table, -1, sizeof(remap_table));
1629
1630    foreach_list(node, &this->instructions) {
1631       const fs_inst *inst = (const fs_inst *) node;
1632
1633       if (inst->dst.file == GRF)
1634          remap_table[inst->dst.reg] = 0;
1635
1636       for (int i = 0; i < 3; i++) {
1637          if (inst->src[i].file == GRF)
1638             remap_table[inst->src[i].reg] = 0;
1639       }
1640    }
1641
1642    /* In addition to registers used in instructions, fs_visitor keeps
1643     * direct references to certain special values which must be patched:
1644     */
1645    fs_reg *special[] = {
1646       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1647       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1648       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1649       &delta_x[0], &delta_x[1], &delta_x[2],
1650       &delta_x[3], &delta_x[4], &delta_x[5],
1651       &delta_y[0], &delta_y[1], &delta_y[2],
1652       &delta_y[3], &delta_y[4], &delta_y[5],
1653    };
1654    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1655    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1656
1657    /* Treat all special values as used, to be conservative */
1658    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1659       if (special[i]->file == GRF)
1660          remap_table[special[i]->reg] = 0;
1661    }
1662
1663    /* Compact the GRF arrays. */
1664    int new_index = 0;
1665    for (int i = 0; i < this->virtual_grf_count; i++) {
1666       if (remap_table[i] != -1) {
1667          remap_table[i] = new_index;
1668          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1669          invalidate_live_intervals();
1670          ++new_index;
1671       }
1672    }
1673
1674    this->virtual_grf_count = new_index;
1675
1676    /* Patch all the instructions to use the newly renumbered registers */
1677    foreach_list(node, &this->instructions) {
1678       fs_inst *inst = (fs_inst *) node;
1679
1680       if (inst->dst.file == GRF)
1681          inst->dst.reg = remap_table[inst->dst.reg];
1682
1683       for (int i = 0; i < 3; i++) {
1684          if (inst->src[i].file == GRF)
1685             inst->src[i].reg = remap_table[inst->src[i].reg];
1686       }
1687    }
1688
1689    /* Patch all the references to special values */
1690    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1691       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1692          special[i]->reg = remap_table[special[i]->reg];
1693    }
1694 }
1695
1696 bool
1697 fs_visitor::remove_dead_constants()
1698 {
1699    if (dispatch_width == 8) {
1700       this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1701       this->nr_params_remap = c->prog_data.nr_params;
1702
1703       for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1704          this->params_remap[i] = -1;
1705
1706       /* Find which params are still in use. */
1707       foreach_list(node, &this->instructions) {
1708          fs_inst *inst = (fs_inst *)node;
1709
1710          for (int i = 0; i < 3; i++) {
1711             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1712
1713             if (inst->src[i].file != UNIFORM)
1714                continue;
1715
1716             /* Section 5.11 of the OpenGL 4.3 spec says:
1717              *
1718              *     "Out-of-bounds reads return undefined values, which include
1719              *     values from other variables of the active program or zero."
1720              */
1721             if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1722                constant_nr = 0;
1723             }
1724
1725             /* For now, set this to non-negative.  We'll give it the
1726              * actual new number in a moment, in order to keep the
1727              * register numbers nicely ordered.
1728              */
1729             this->params_remap[constant_nr] = 0;
1730          }
1731       }
1732
1733       /* Figure out what the new numbers for the params will be.  At some
1734        * point when we're doing uniform array access, we're going to want
1735        * to keep the distinction between .reg and .reg_offset, but for
1736        * now we don't care.
1737        */
1738       unsigned int new_nr_params = 0;
1739       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1740          if (this->params_remap[i] != -1) {
1741             this->params_remap[i] = new_nr_params++;
1742          }
1743       }
1744
1745       /* Update the list of params to be uploaded to match our new numbering. */
1746       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1747          int remapped = this->params_remap[i];
1748
1749          if (remapped == -1)
1750             continue;
1751
1752          c->prog_data.param[remapped] = c->prog_data.param[i];
1753       }
1754
1755       c->prog_data.nr_params = new_nr_params;
1756    } else {
1757       /* This should have been generated in the 8-wide pass already. */
1758       assert(this->params_remap);
1759    }
1760
1761    /* Now do the renumbering of the shader to remove unused params. */
1762    foreach_list(node, &this->instructions) {
1763       fs_inst *inst = (fs_inst *)node;
1764
1765       for (int i = 0; i < 3; i++) {
1766          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1767
1768          if (inst->src[i].file != UNIFORM)
1769             continue;
1770
1771          /* as above alias to 0 */
1772          if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1773             constant_nr = 0;
1774          }
1775          assert(this->params_remap[constant_nr] != -1);
1776          inst->src[i].reg = this->params_remap[constant_nr];
1777          inst->src[i].reg_offset = 0;
1778       }
1779    }
1780
1781    return true;
1782 }
1783
1784 /*
1785  * Implements array access of uniforms by inserting a
1786  * PULL_CONSTANT_LOAD instruction.
1787  *
1788  * Unlike temporary GRF array access (where we don't support it due to
1789  * the difficulty of doing relative addressing on instruction
1790  * destinations), we could potentially do array access of uniforms
1791  * that were loaded in GRF space as push constants.  In real-world
1792  * usage we've seen, though, the arrays being used are always larger
1793  * than we could load as push constants, so just always move all
1794  * uniform array access out to a pull constant buffer.
1795  */
1796 void
1797 fs_visitor::move_uniform_array_access_to_pull_constants()
1798 {
1799    int pull_constant_loc[c->prog_data.nr_params];
1800
1801    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1802       pull_constant_loc[i] = -1;
1803    }
1804
1805    /* Walk through and find array access of uniforms.  Put a copy of that
1806     * uniform in the pull constant buffer.
1807     *
1808     * Note that we don't move constant-indexed accesses to arrays.  No
1809     * testing has been done of the performance impact of this choice.
1810     */
1811    foreach_list_safe(node, &this->instructions) {
1812       fs_inst *inst = (fs_inst *)node;
1813
1814       for (int i = 0 ; i < 3; i++) {
1815          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1816             continue;
1817
1818          int uniform = inst->src[i].reg;
1819
1820          /* If this array isn't already present in the pull constant buffer,
1821           * add it.
1822           */
1823          if (pull_constant_loc[uniform] == -1) {
1824             const float **values = &c->prog_data.param[uniform];
1825
1826             pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1827
1828             assert(param_size[uniform]);
1829
1830             for (int j = 0; j < param_size[uniform]; j++) {
1831                c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1832                   values[j];
1833             }
1834          }
1835
1836          /* Set up the annotation tracking for new generated instructions. */
1837          base_ir = inst->ir;
1838          current_annotation = inst->annotation;
1839
1840          fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1841          fs_reg temp = fs_reg(this, glsl_type::float_type);
1842          exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1843                                                      surf_index,
1844                                                      *inst->src[i].reladdr,
1845                                                      pull_constant_loc[uniform] +
1846                                                      inst->src[i].reg_offset);
1847          inst->insert_before(&list);
1848
1849          inst->src[i].file = temp.file;
1850          inst->src[i].reg = temp.reg;
1851          inst->src[i].reg_offset = temp.reg_offset;
1852          inst->src[i].reladdr = NULL;
1853       }
1854    }
1855 }
1856
1857 /**
1858  * Choose accesses from the UNIFORM file to demote to using the pull
1859  * constant buffer.
1860  *
1861  * We allow a fragment shader to have more than the specified minimum
1862  * maximum number of fragment shader uniform components (64).  If
1863  * there are too many of these, they'd fill up all of register space.
1864  * So, this will push some of them out to the pull constant buffer and
1865  * update the program to load them.
1866  */
1867 void
1868 fs_visitor::setup_pull_constants()
1869 {
1870    /* Only allow 16 registers (128 uniform components) as push constants. */
1871    unsigned int max_uniform_components = 16 * 8;
1872    if (c->prog_data.nr_params <= max_uniform_components)
1873       return;
1874
1875    if (dispatch_width == 16) {
1876       fail("Pull constants not supported in 16-wide\n");
1877       return;
1878    }
1879
1880    /* Just demote the end of the list.  We could probably do better
1881     * here, demoting things that are rarely used in the program first.
1882     */
1883    unsigned int pull_uniform_base = max_uniform_components;
1884
1885    int pull_constant_loc[c->prog_data.nr_params];
1886    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1887       if (i < pull_uniform_base) {
1888          pull_constant_loc[i] = -1;
1889       } else {
1890          pull_constant_loc[i] = -1;
1891          /* If our constant is already being uploaded for reladdr purposes,
1892           * reuse it.
1893           */
1894          for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1895             if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1896                pull_constant_loc[i] = j;
1897                break;
1898             }
1899          }
1900          if (pull_constant_loc[i] == -1) {
1901             int pull_index = c->prog_data.nr_pull_params++;
1902             c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1903             pull_constant_loc[i] = pull_index;;
1904          }
1905       }
1906    }
1907    c->prog_data.nr_params = pull_uniform_base;
1908
1909    foreach_list(node, &this->instructions) {
1910       fs_inst *inst = (fs_inst *)node;
1911
1912       for (int i = 0; i < 3; i++) {
1913          if (inst->src[i].file != UNIFORM)
1914             continue;
1915
1916          int pull_index = pull_constant_loc[inst->src[i].reg +
1917                                             inst->src[i].reg_offset];
1918          if (pull_index == -1)
1919             continue;
1920
1921          assert(!inst->src[i].reladdr);
1922
1923          fs_reg dst = fs_reg(this, glsl_type::float_type);
1924          fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1925          fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1926          fs_inst *pull =
1927             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1928                                  dst, index, offset);
1929          pull->ir = inst->ir;
1930          pull->annotation = inst->annotation;
1931
1932          inst->insert_before(pull);
1933
1934          inst->src[i].file = GRF;
1935          inst->src[i].reg = dst.reg;
1936          inst->src[i].reg_offset = 0;
1937          inst->src[i].smear = pull_index & 3;
1938       }
1939    }
1940 }
1941
1942 bool
1943 fs_visitor::opt_algebraic()
1944 {
1945    bool progress = false;
1946
1947    foreach_list(node, &this->instructions) {
1948       fs_inst *inst = (fs_inst *)node;
1949
1950       switch (inst->opcode) {
1951       case BRW_OPCODE_MUL:
1952          if (inst->src[1].file != IMM)
1953             continue;
1954
1955          /* a * 1.0 = a */
1956          if (inst->src[1].is_one()) {
1957             inst->opcode = BRW_OPCODE_MOV;
1958             inst->src[1] = reg_undef;
1959             progress = true;
1960             break;
1961          }
1962
1963          /* a * 0.0 = 0.0 */
1964          if (inst->src[1].is_zero()) {
1965             inst->opcode = BRW_OPCODE_MOV;
1966             inst->src[0] = inst->src[1];
1967             inst->src[1] = reg_undef;
1968             progress = true;
1969             break;
1970          }
1971
1972          break;
1973       case BRW_OPCODE_ADD:
1974          if (inst->src[1].file != IMM)
1975             continue;
1976
1977          /* a + 0.0 = a */
1978          if (inst->src[1].is_zero()) {
1979             inst->opcode = BRW_OPCODE_MOV;
1980             inst->src[1] = reg_undef;
1981             progress = true;
1982             break;
1983          }
1984          break;
1985       case BRW_OPCODE_OR:
1986          if (inst->src[0].equals(inst->src[1])) {
1987             inst->opcode = BRW_OPCODE_MOV;
1988             inst->src[1] = reg_undef;
1989             progress = true;
1990             break;
1991          }
1992          break;
1993       case BRW_OPCODE_SEL:
1994          if (inst->saturate && inst->src[1].file == IMM) {
1995             switch (inst->conditional_mod) {
1996             case BRW_CONDITIONAL_LE:
1997             case BRW_CONDITIONAL_L:
1998                switch (inst->src[1].type) {
1999                case BRW_REGISTER_TYPE_F:
2000                   if (inst->src[1].imm.f >= 1.0f) {
2001                      inst->opcode = BRW_OPCODE_MOV;
2002                      inst->src[1] = reg_undef;
2003                      progress = true;
2004                   }
2005                   break;
2006                default:
2007                   break;
2008                }
2009                break;
2010             case BRW_CONDITIONAL_GE:
2011             case BRW_CONDITIONAL_G:
2012                switch (inst->src[1].type) {
2013                case BRW_REGISTER_TYPE_F:
2014                   if (inst->src[1].imm.f <= 0.0f) {
2015                      inst->opcode = BRW_OPCODE_MOV;
2016                      inst->src[1] = reg_undef;
2017                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2018                      progress = true;
2019                   }
2020                   break;
2021                default:
2022                   break;
2023                }
2024             default:
2025                break;
2026             }
2027          }
2028          break;
2029       default:
2030          break;
2031       }
2032    }
2033
2034    return progress;
2035 }
2036
2037 /**
2038  * Removes any instructions writing a VGRF where that VGRF is not used by any
2039  * later instruction.
2040  */
2041 bool
2042 fs_visitor::dead_code_eliminate()
2043 {
2044    bool progress = false;
2045    int pc = 0;
2046
2047    calculate_live_intervals();
2048
2049    foreach_list_safe(node, &this->instructions) {
2050       fs_inst *inst = (fs_inst *)node;
2051
2052       if (inst->dst.file == GRF && !inst->has_side_effects()) {
2053          bool dead = true;
2054
2055          for (int i = 0; i < inst->regs_written; i++) {
2056             int var = live_intervals->var_from_vgrf[inst->dst.reg];
2057             assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2058             if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2059                dead = false;
2060                break;
2061             }
2062          }
2063
2064          if (dead) {
2065             /* Don't dead code eliminate instructions that write to the
2066              * accumulator as a side-effect. Instead just set the destination
2067              * to the null register to free it.
2068              */
2069             switch (inst->opcode) {
2070             case BRW_OPCODE_ADDC:
2071             case BRW_OPCODE_SUBB:
2072             case BRW_OPCODE_MACH:
2073                inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2074                break;
2075             default:
2076                inst->remove();
2077                progress = true;
2078                break;
2079             }
2080          }
2081       }
2082
2083       pc++;
2084    }
2085
2086    if (progress)
2087       invalidate_live_intervals();
2088
2089    return progress;
2090 }
2091
/**
 * Hash table key used by the local dead code elimination pass: identifies
 * one register of a virtual GRF.
 *
 * Keys are hashed and compared as raw bytes, so any field added here must
 * not introduce uninitialized padding.
 */
struct dead_code_hash_key
{
   /** Virtual GRF number. */
   int vgrf;
   /** Register offset within that virtual GRF. */
   int reg_offset;
};
2097
2098 static bool
2099 dead_code_hash_compare(const void *a, const void *b)
2100 {
2101    return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2102 }
2103
2104 static void
2105 clear_dead_code_hash(struct hash_table *ht)
2106 {
2107    struct hash_entry *entry;
2108
2109    hash_table_foreach(ht, entry) {
2110       _mesa_hash_table_remove(ht, entry);
2111    }
2112 }
2113
2114 static void
2115 insert_dead_code_hash(struct hash_table *ht,
2116                       int vgrf, int reg_offset, fs_inst *inst)
2117 {
2118    /* We don't bother freeing keys, because they'll be GCed with the ht. */
2119    struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2120
2121    key->vgrf = vgrf;
2122    key->reg_offset = reg_offset;
2123
2124    _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2125 }
2126
2127 static struct hash_entry *
2128 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2129 {
2130    struct dead_code_hash_key key;
2131
2132    key.vgrf = vgrf;
2133    key.reg_offset = reg_offset;
2134
2135    return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2136 }
2137
2138 static void
2139 remove_dead_code_hash(struct hash_table *ht,
2140                       int vgrf, int reg_offset)
2141 {
2142    struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2143    if (!entry)
2144       return;
2145
2146    _mesa_hash_table_remove(ht, entry);
2147 }
2148
2149 /**
2150  * Walks basic blocks, removing any regs that are written but not read before
2151  * being redefined.
2152  *
2153  * The dead_code_eliminate() function implements a global dead code
2154  * elimination, but it only handles the removing the last write to a register
2155  * if it's never read.  This one can handle intermediate writes, but only
2156  * within a basic block.
2157  */
2158 bool
2159 fs_visitor::dead_code_eliminate_local()
2160 {
2161    struct hash_table *ht;
2162    bool progress = false;
2163
2164    ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2165
2166    foreach_list_safe(node, &this->instructions) {
2167       fs_inst *inst = (fs_inst *)node;
2168
2169       /* At a basic block, empty the HT since we don't understand dataflow
2170        * here.
2171        */
2172       if (inst->is_control_flow()) {
2173          clear_dead_code_hash(ht);
2174          continue;
2175       }
2176
2177       /* Clear the HT of any instructions that got read. */
2178       for (int i = 0; i < 3; i++) {
2179          fs_reg src = inst->src[i];
2180          if (src.file != GRF)
2181             continue;
2182
2183          int read = 1;
2184          if (inst->is_send_from_grf())
2185             read = virtual_grf_sizes[src.reg] - src.reg_offset;
2186
2187          for (int reg_offset = src.reg_offset;
2188               reg_offset < src.reg_offset + read;
2189               reg_offset++) {
2190             remove_dead_code_hash(ht, src.reg, reg_offset);
2191          }
2192       }
2193
2194       /* Add any update of a GRF to the HT, removing a previous write if it
2195        * wasn't read.
2196        */
2197       if (inst->dst.file == GRF) {
2198          if (inst->regs_written > 1) {
2199             /* We don't know how to trim channels from an instruction's
2200              * writes, so we can't incrementally remove unread channels from
2201              * it.  Just remove whatever it overwrites from the table
2202              */
2203             for (int i = 0; i < inst->regs_written; i++) {
2204                remove_dead_code_hash(ht,
2205                                      inst->dst.reg,
2206                                      inst->dst.reg_offset + i);
2207             }
2208          } else {
2209             struct hash_entry *entry =
2210                get_dead_code_hash_entry(ht, inst->dst.reg,
2211                                         inst->dst.reg_offset);
2212
2213             if (entry) {
2214                if (inst->is_partial_write()) {
2215                   /* For a partial write, we can't remove any previous dead code
2216                    * candidate, since we're just modifying their result.
2217                    */
2218                } else {
2219                   /* We're completely updating a channel, and there was a
2220                    * previous write to the channel that wasn't read.  Kill it!
2221                    */
2222                   fs_inst *inst = (fs_inst *)entry->data;
2223                   inst->remove();
2224                   progress = true;
2225                }
2226
2227                _mesa_hash_table_remove(ht, entry);
2228             }
2229
2230             if (!inst->has_side_effects())
2231                insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2232                                      inst);
2233          }
2234       }
2235    }
2236
2237    _mesa_hash_table_destroy(ht, NULL);
2238
2239    if (progress)
2240       invalidate_live_intervals();
2241
2242    return progress;
2243 }
2244
2245 /**
2246  * Implements register coalescing: Checks if the two registers involved in a
2247  * raw move don't interfere, in which case they can both be stored in the same
2248  * place and the MOV removed.
2249  */
2250 bool
2251 fs_visitor::register_coalesce()
2252 {
2253    bool progress = false;
2254
2255    calculate_live_intervals();
2256
2257    foreach_list_safe(node, &this->instructions) {
2258       fs_inst *inst = (fs_inst *)node;
2259
2260       if (inst->opcode != BRW_OPCODE_MOV ||
2261           inst->is_partial_write() ||
2262           inst->saturate ||
2263           inst->src[0].file != GRF ||
2264           inst->src[0].negate ||
2265           inst->src[0].abs ||
2266           inst->src[0].smear != -1 ||
2267           inst->dst.file != GRF ||
2268           inst->dst.type != inst->src[0].type ||
2269           virtual_grf_sizes[inst->src[0].reg] != 1) {
2270          continue;
2271       }
2272
2273       int var_from = live_intervals->var_from_reg(&inst->src[0]);
2274       int var_to = live_intervals->var_from_reg(&inst->dst);
2275
2276       if (live_intervals->vars_interfere(var_from, var_to) &&
2277           !inst->dst.equals(inst->src[0]))
2278          continue;
2279
2280       int reg_from = inst->src[0].reg;
2281       assert(inst->src[0].reg_offset == 0);
2282       int reg_to = inst->dst.reg;
2283       int reg_to_offset = inst->dst.reg_offset;
2284
2285       foreach_list(node, &this->instructions) {
2286          fs_inst *scan_inst = (fs_inst *)node;
2287
2288          if (scan_inst->dst.file == GRF &&
2289              scan_inst->dst.reg == reg_from) {
2290             scan_inst->dst.reg = reg_to;
2291             scan_inst->dst.reg_offset = reg_to_offset;
2292          }
2293          for (int i = 0; i < 3; i++) {
2294             if (scan_inst->src[i].file == GRF &&
2295                 scan_inst->src[i].reg == reg_from) {
2296                scan_inst->src[i].reg = reg_to;
2297                scan_inst->src[i].reg_offset = reg_to_offset;
2298             }
2299          }
2300       }
2301
2302       inst->remove();
2303       progress = true;
2304       continue;
2305    }
2306
2307    if (progress)
2308       invalidate_live_intervals();
2309
2310    return progress;
2311 }
2312
2313 bool
2314 fs_visitor::compute_to_mrf()
2315 {
2316    bool progress = false;
2317    int next_ip = 0;
2318
2319    calculate_live_intervals();
2320
2321    foreach_list_safe(node, &this->instructions) {
2322       fs_inst *inst = (fs_inst *)node;
2323
2324       int ip = next_ip;
2325       next_ip++;
2326
2327       if (inst->opcode != BRW_OPCODE_MOV ||
2328           inst->is_partial_write() ||
2329           inst->dst.file != MRF || inst->src[0].file != GRF ||
2330           inst->dst.type != inst->src[0].type ||
2331           inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2332          continue;
2333
2334       /* Work out which hardware MRF registers are written by this
2335        * instruction.
2336        */
2337       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2338       int mrf_high;
2339       if (inst->dst.reg & BRW_MRF_COMPR4) {
2340          mrf_high = mrf_low + 4;
2341       } else if (dispatch_width == 16 &&
2342                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2343          mrf_high = mrf_low + 1;
2344       } else {
2345          mrf_high = mrf_low;
2346       }
2347
2348       /* Can't compute-to-MRF this GRF if someone else was going to
2349        * read it later.
2350        */
2351       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2352          continue;
2353
2354       /* Found a move of a GRF to a MRF.  Let's see if we can go
2355        * rewrite the thing that made this GRF to write into the MRF.
2356        */
2357       fs_inst *scan_inst;
2358       for (scan_inst = (fs_inst *)inst->prev;
2359            scan_inst->prev != NULL;
2360            scan_inst = (fs_inst *)scan_inst->prev) {
2361          if (scan_inst->dst.file == GRF &&
2362              scan_inst->dst.reg == inst->src[0].reg) {
2363             /* Found the last thing to write our reg we want to turn
2364              * into a compute-to-MRF.
2365              */
2366
2367             /* If this one instruction didn't populate all the
2368              * channels, bail.  We might be able to rewrite everything
2369              * that writes that reg, but it would require smarter
2370              * tracking to delay the rewriting until complete success.
2371              */
2372             if (scan_inst->is_partial_write())
2373                break;
2374
2375             /* Things returning more than one register would need us to
2376              * understand coalescing out more than one MOV at a time.
2377              */
2378             if (scan_inst->regs_written > 1)
2379                break;
2380
2381             /* SEND instructions can't have MRF as a destination. */
2382             if (scan_inst->mlen)
2383                break;
2384
2385             if (brw->gen == 6) {
2386                /* gen6 math instructions must have the destination be
2387                 * GRF, so no compute-to-MRF for them.
2388                 */
2389                if (scan_inst->is_math()) {
2390                   break;
2391                }
2392             }
2393
2394             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2395                /* Found the creator of our MRF's source value. */
2396                scan_inst->dst.file = MRF;
2397                scan_inst->dst.reg = inst->dst.reg;
2398                scan_inst->saturate |= inst->saturate;
2399                inst->remove();
2400                progress = true;
2401             }
2402             break;
2403          }
2404
2405          /* We don't handle control flow here.  Most computation of
2406           * values that end up in MRFs are shortly before the MRF
2407           * write anyway.
2408           */
2409          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2410             break;
2411
2412          /* You can't read from an MRF, so if someone else reads our
2413           * MRF's source GRF that we wanted to rewrite, that stops us.
2414           */
2415          bool interfered = false;
2416          for (int i = 0; i < 3; i++) {
2417             if (scan_inst->src[i].file == GRF &&
2418                 scan_inst->src[i].reg == inst->src[0].reg &&
2419                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2420                interfered = true;
2421             }
2422          }
2423          if (interfered)
2424             break;
2425
2426          if (scan_inst->dst.file == MRF) {
2427             /* If somebody else writes our MRF here, we can't
2428              * compute-to-MRF before that.
2429              */
2430             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2431             int scan_mrf_high;
2432
2433             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2434                scan_mrf_high = scan_mrf_low + 4;
2435             } else if (dispatch_width == 16 &&
2436                        (!scan_inst->force_uncompressed &&
2437                         !scan_inst->force_sechalf)) {
2438                scan_mrf_high = scan_mrf_low + 1;
2439             } else {
2440                scan_mrf_high = scan_mrf_low;
2441             }
2442
2443             if (mrf_low == scan_mrf_low ||
2444                 mrf_low == scan_mrf_high ||
2445                 mrf_high == scan_mrf_low ||
2446                 mrf_high == scan_mrf_high) {
2447                break;
2448             }
2449          }
2450
2451          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2452             /* Found a SEND instruction, which means that there are
2453              * live values in MRFs from base_mrf to base_mrf +
2454              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2455              * above it.
2456              */
2457             if (mrf_low >= scan_inst->base_mrf &&
2458                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2459                break;
2460             }
2461             if (mrf_high >= scan_inst->base_mrf &&
2462                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2463                break;
2464             }
2465          }
2466       }
2467    }
2468
2469    if (progress)
2470       invalidate_live_intervals();
2471
2472    return progress;
2473 }
2474
2475 /**
2476  * Walks through basic blocks, looking for repeated MRF writes and
2477  * removing the later ones.
2478  */
2479 bool
2480 fs_visitor::remove_duplicate_mrf_writes()
2481 {
2482    fs_inst *last_mrf_move[16];
2483    bool progress = false;
2484
2485    /* Need to update the MRF tracking for compressed instructions. */
2486    if (dispatch_width == 16)
2487       return false;
2488
2489    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2490
2491    foreach_list_safe(node, &this->instructions) {
2492       fs_inst *inst = (fs_inst *)node;
2493
2494       if (inst->is_control_flow()) {
2495          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2496       }
2497
2498       if (inst->opcode == BRW_OPCODE_MOV &&
2499           inst->dst.file == MRF) {
2500          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2501          if (prev_inst && inst->equals(prev_inst)) {
2502             inst->remove();
2503             progress = true;
2504             continue;
2505          }
2506       }
2507
2508       /* Clear out the last-write records for MRFs that were overwritten. */
2509       if (inst->dst.file == MRF) {
2510          last_mrf_move[inst->dst.reg] = NULL;
2511       }
2512
2513       if (inst->mlen > 0 && inst->base_mrf != -1) {
2514          /* Found a SEND instruction, which will include two or fewer
2515           * implied MRF writes.  We could do better here.
2516           */
2517          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2518             last_mrf_move[inst->base_mrf + i] = NULL;
2519          }
2520       }
2521
2522       /* Clear out any MRF move records whose sources got overwritten. */
2523       if (inst->dst.file == GRF) {
2524          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2525             if (last_mrf_move[i] &&
2526                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2527                last_mrf_move[i] = NULL;
2528             }
2529          }
2530       }
2531
2532       if (inst->opcode == BRW_OPCODE_MOV &&
2533           inst->dst.file == MRF &&
2534           inst->src[0].file == GRF &&
2535           !inst->is_partial_write()) {
2536          last_mrf_move[inst->dst.reg] = inst;
2537       }
2538    }
2539
2540    if (progress)
2541       invalidate_live_intervals();
2542
2543    return progress;
2544 }
2545
2546 static void
2547 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2548                         int first_grf, int grf_len)
2549 {
2550    bool inst_16wide = (dispatch_width > 8 &&
2551                        !inst->force_uncompressed &&
2552                        !inst->force_sechalf);
2553
2554    /* Clear the flag for registers that actually got read (as expected). */
2555    for (int i = 0; i < 3; i++) {
2556       int grf;
2557       if (inst->src[i].file == GRF) {
2558          grf = inst->src[i].reg;
2559       } else if (inst->src[i].file == HW_REG &&
2560                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2561          grf = inst->src[i].fixed_hw_reg.nr;
2562       } else {
2563          continue;
2564       }
2565
2566       if (grf >= first_grf &&
2567           grf < first_grf + grf_len) {
2568          deps[grf - first_grf] = false;
2569          if (inst_16wide)
2570             deps[grf - first_grf + 1] = false;
2571       }
2572    }
2573 }
2574
2575 /**
2576  * Implements this workaround for the original 965:
2577  *
2578  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2579  *      check for post destination dependencies on this instruction, software
2580  *      must ensure that there is no destination hazard for the case of ‘write
2581  *      followed by a posted write’ shown in the following example.
2582  *
2583  *      1. mov r3 0
2584  *      2. send r3.xy <rest of send instruction>
2585  *      3. mov r2 r3
2586  *
2587  *      Due to no post-destination dependency check on the ‘send’, the above
2588  *      code sequence could have two instructions (1 and 2) in flight at the
2589  *      same time that both consider ‘r3’ as the target of their final writes.
2590  */
2591 void
2592 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2593 {
2594    int reg_size = dispatch_width / 8;
2595    int write_len = inst->regs_written * reg_size;
2596    int first_write_grf = inst->dst.reg;
2597    bool needs_dep[BRW_MAX_MRF];
2598    assert(write_len < (int)sizeof(needs_dep) - 1);
2599
2600    memset(needs_dep, false, sizeof(needs_dep));
2601    memset(needs_dep, true, write_len);
2602
2603    clear_deps_for_inst_src(inst, dispatch_width,
2604                            needs_dep, first_write_grf, write_len);
2605
2606    /* Walk backwards looking for writes to registers we're writing which
2607     * aren't read since being written.  If we hit the start of the program,
2608     * we assume that there are no outstanding dependencies on entry to the
2609     * program.
2610     */
2611    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2612         scan_inst != NULL;
2613         scan_inst = (fs_inst *)scan_inst->prev) {
2614
2615       /* If we hit control flow, assume that there *are* outstanding
2616        * dependencies, and force their cleanup before our instruction.
2617        */
2618       if (scan_inst->is_control_flow()) {
2619          for (int i = 0; i < write_len; i++) {
2620             if (needs_dep[i]) {
2621                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2622             }
2623          }
2624          return;
2625       }
2626
2627       bool scan_inst_16wide = (dispatch_width > 8 &&
2628                                !scan_inst->force_uncompressed &&
2629                                !scan_inst->force_sechalf);
2630
2631       /* We insert our reads as late as possible on the assumption that any
2632        * instruction but a MOV that might have left us an outstanding
2633        * dependency has more latency than a MOV.
2634        */
2635       if (scan_inst->dst.file == GRF) {
2636          for (int i = 0; i < scan_inst->regs_written; i++) {
2637             int reg = scan_inst->dst.reg + i * reg_size;
2638
2639             if (reg >= first_write_grf &&
2640                 reg < first_write_grf + write_len &&
2641                 needs_dep[reg - first_write_grf]) {
2642                inst->insert_before(DEP_RESOLVE_MOV(reg));
2643                needs_dep[reg - first_write_grf] = false;
2644                if (scan_inst_16wide)
2645                   needs_dep[reg - first_write_grf + 1] = false;
2646             }
2647          }
2648       }
2649
2650       /* Clear the flag for registers that actually got read (as expected). */
2651       clear_deps_for_inst_src(scan_inst, dispatch_width,
2652                               needs_dep, first_write_grf, write_len);
2653
2654       /* Continue the loop only if we haven't resolved all the dependencies */
2655       int i;
2656       for (i = 0; i < write_len; i++) {
2657          if (needs_dep[i])
2658             break;
2659       }
2660       if (i == write_len)
2661          return;
2662    }
2663 }
2664
2665 /**
2666  * Implements this workaround for the original 965:
2667  *
2668  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2669  *      used as a destination register until after it has been sourced by an
2670  *      instruction with a different destination register.
2671  */
2672 void
2673 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2674 {
2675    int write_len = inst->regs_written * dispatch_width / 8;
2676    int first_write_grf = inst->dst.reg;
2677    bool needs_dep[BRW_MAX_MRF];
2678    assert(write_len < (int)sizeof(needs_dep) - 1);
2679
2680    memset(needs_dep, false, sizeof(needs_dep));
2681    memset(needs_dep, true, write_len);
2682    /* Walk forwards looking for writes to registers we're writing which aren't
2683     * read before being written.
2684     */
2685    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2686         !scan_inst->is_tail_sentinel();
2687         scan_inst = (fs_inst *)scan_inst->next) {
2688       /* If we hit control flow, force resolve all remaining dependencies. */
2689       if (scan_inst->is_control_flow()) {
2690          for (int i = 0; i < write_len; i++) {
2691             if (needs_dep[i])
2692                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2693          }
2694          return;
2695       }
2696
2697       /* Clear the flag for registers that actually got read (as expected). */
2698       clear_deps_for_inst_src(scan_inst, dispatch_width,
2699                               needs_dep, first_write_grf, write_len);
2700
2701       /* We insert our reads as late as possible since they're reading the
2702        * result of a SEND, which has massive latency.
2703        */
2704       if (scan_inst->dst.file == GRF &&
2705           scan_inst->dst.reg >= first_write_grf &&
2706           scan_inst->dst.reg < first_write_grf + write_len &&
2707           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2708          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2709          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2710       }
2711
2712       /* Continue the loop only if we haven't resolved all the dependencies */
2713       int i;
2714       for (i = 0; i < write_len; i++) {
2715          if (needs_dep[i])
2716             break;
2717       }
2718       if (i == write_len)
2719          return;
2720    }
2721
2722    /* If we hit the end of the program, resolve all remaining dependencies out
2723     * of paranoia.
2724     */
2725    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2726    assert(last_inst->eot);
2727    for (int i = 0; i < write_len; i++) {
2728       if (needs_dep[i])
2729          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2730    }
2731 }
2732
2733 void
2734 fs_visitor::insert_gen4_send_dependency_workarounds()
2735 {
2736    if (brw->gen != 4 || brw->is_g4x)
2737       return;
2738
2739    /* Note that we're done with register allocation, so GRF fs_regs always
2740     * have a .reg_offset of 0.
2741     */
2742
2743    foreach_list_safe(node, &this->instructions) {
2744       fs_inst *inst = (fs_inst *)node;
2745
2746       if (inst->mlen != 0 && inst->dst.file == GRF) {
2747          insert_gen4_pre_send_dependency_workarounds(inst);
2748          insert_gen4_post_send_dependency_workarounds(inst);
2749       }
2750    }
2751 }
2752
2753 /**
2754  * Turns the generic expression-style uniform pull constant load instruction
2755  * into a hardware-specific series of instructions for loading a pull
2756  * constant.
2757  *
2758  * The expression style allows the CSE pass before this to optimize out
2759  * repeated loads from the same offset, and gives the pre-register-allocation
2760  * scheduling full flexibility, while the conversion to native instructions
2761  * allows the post-register-allocation scheduler the best information
2762  * possible.
2763  *
2764  * Note that execution masking for setting up pull constant loads is special:
2765  * the channels that need to be written are unrelated to the current execution
2766  * mask, since a later instruction will use one of the result channels as a
2767  * source operand for all 8 or 16 of its channels.
2768  */
2769 void
2770 fs_visitor::lower_uniform_pull_constant_loads()
2771 {
2772    foreach_list(node, &this->instructions) {
2773       fs_inst *inst = (fs_inst *)node;
2774
2775       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2776          continue;
2777
2778       if (brw->gen >= 7) {
2779          /* The offset arg before was a vec4-aligned byte offset.  We need to
2780           * turn it into a dword offset.
2781           */
2782          fs_reg const_offset_reg = inst->src[1];
2783          assert(const_offset_reg.file == IMM &&
2784                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2785          const_offset_reg.imm.u /= 4;
2786          fs_reg payload = fs_reg(this, glsl_type::uint_type);
2787
2788          /* This is actually going to be a MOV, but since only the first dword
2789           * is accessed, we have a special opcode to do just that one.  Note
2790           * that this needs to be an operation that will be considered a def
2791           * by live variable analysis, or register allocation will explode.
2792           */
2793          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2794                                                payload, const_offset_reg);
2795          setup->force_writemask_all = true;
2796
2797          setup->ir = inst->ir;
2798          setup->annotation = inst->annotation;
2799          inst->insert_before(setup);
2800
2801          /* Similarly, this will only populate the first 4 channels of the
2802           * result register (since we only use smear values from 0-3), but we
2803           * don't tell the optimizer.
2804           */
2805          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2806          inst->src[1] = payload;
2807
2808          invalidate_live_intervals();
2809       } else {
2810          /* Before register allocation, we didn't tell the scheduler about the
2811           * MRF we use.  We know it's safe to use this MRF because nothing
2812           * else does except for register spill/unspill, which generates and
2813           * uses its MRF within a single IR instruction.
2814           */
2815          inst->base_mrf = 14;
2816          inst->mlen = 1;
2817       }
2818    }
2819 }
2820
2821 void
2822 fs_visitor::dump_instruction(backend_instruction *be_inst)
2823 {
2824    fs_inst *inst = (fs_inst *)be_inst;
2825
2826    if (inst->predicate) {
2827       printf("(%cf0.%d) ",
2828              inst->predicate_inverse ? '-' : '+',
2829              inst->flag_subreg);
2830    }
2831
2832    printf("%s", brw_instruction_name(inst->opcode));
2833    if (inst->saturate)
2834       printf(".sat");
2835    if (inst->conditional_mod) {
2836       printf("%s", conditional_modifier[inst->conditional_mod]);
2837       if (!inst->predicate &&
2838           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2839                               inst->opcode != BRW_OPCODE_IF &&
2840                               inst->opcode != BRW_OPCODE_WHILE))) {
2841          printf(".f0.%d", inst->flag_subreg);
2842       }
2843    }
2844    printf(" ");
2845
2846
2847    switch (inst->dst.file) {
2848    case GRF:
2849       printf("vgrf%d", inst->dst.reg);
2850       if (inst->dst.reg_offset)
2851          printf("+%d", inst->dst.reg_offset);
2852       break;
2853    case MRF:
2854       printf("m%d", inst->dst.reg);
2855       break;
2856    case BAD_FILE:
2857       printf("(null)");
2858       break;
2859    case UNIFORM:
2860       printf("***u%d***", inst->dst.reg);
2861       break;
2862    case HW_REG:
2863       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2864          switch (inst->dst.fixed_hw_reg.nr) {
2865          case BRW_ARF_NULL:
2866             printf("null");
2867             break;
2868          case BRW_ARF_ADDRESS:
2869             printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
2870             break;
2871          case BRW_ARF_ACCUMULATOR:
2872             printf("acc%d", inst->dst.fixed_hw_reg.subnr);
2873             break;
2874          case BRW_ARF_FLAG:
2875             printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2876                              inst->dst.fixed_hw_reg.subnr);
2877             break;
2878          default:
2879             printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2880                                inst->dst.fixed_hw_reg.subnr);
2881             break;
2882          }
2883       } else {
2884          printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2885       }
2886       if (inst->dst.fixed_hw_reg.subnr)
2887          printf("+%d", inst->dst.fixed_hw_reg.subnr);
2888       break;
2889    default:
2890       printf("???");
2891       break;
2892    }
2893    printf(":%s, ", reg_encoding[inst->dst.type]);
2894
2895    for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
2896       if (inst->src[i].negate)
2897          printf("-");
2898       if (inst->src[i].abs)
2899          printf("|");
2900       switch (inst->src[i].file) {
2901       case GRF:
2902          printf("vgrf%d", inst->src[i].reg);
2903          if (inst->src[i].reg_offset)
2904             printf("+%d", inst->src[i].reg_offset);
2905          break;
2906       case MRF:
2907          printf("***m%d***", inst->src[i].reg);
2908          break;
2909       case UNIFORM:
2910          printf("u%d", inst->src[i].reg);
2911          if (inst->src[i].reg_offset)
2912             printf(".%d", inst->src[i].reg_offset);
2913          break;
2914       case BAD_FILE:
2915          printf("(null)");
2916          break;
2917       case IMM:
2918          switch (inst->src[i].type) {
2919          case BRW_REGISTER_TYPE_F:
2920             printf("%ff", inst->src[i].imm.f);
2921             break;
2922          case BRW_REGISTER_TYPE_D:
2923             printf("%dd", inst->src[i].imm.i);
2924             break;
2925          case BRW_REGISTER_TYPE_UD:
2926             printf("%uu", inst->src[i].imm.u);
2927             break;
2928          default:
2929             printf("???");
2930             break;
2931          }
2932          break;
2933       case HW_REG:
2934          if (inst->src[i].fixed_hw_reg.negate)
2935             printf("-");
2936          if (inst->src[i].fixed_hw_reg.abs)
2937             printf("|");
2938          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2939             switch (inst->src[i].fixed_hw_reg.nr) {
2940             case BRW_ARF_NULL:
2941                printf("null");
2942                break;
2943             case BRW_ARF_ADDRESS:
2944                printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
2945                break;
2946             case BRW_ARF_ACCUMULATOR:
2947                printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
2948                break;
2949             case BRW_ARF_FLAG:
2950                printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2951                                 inst->src[i].fixed_hw_reg.subnr);
2952                break;
2953             default:
2954                printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2955                                   inst->src[i].fixed_hw_reg.subnr);
2956                break;
2957             }
2958          } else {
2959             printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2960          }
2961          if (inst->src[i].fixed_hw_reg.subnr)
2962             printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2963          if (inst->src[i].fixed_hw_reg.abs)
2964             printf("|");
2965          break;
2966       default:
2967          printf("???");
2968          break;
2969       }
2970       if (inst->src[i].abs)
2971          printf("|");
2972
2973       if (inst->src[i].file != IMM) {
2974          printf(":%s", reg_encoding[inst->src[i].type]);
2975       }
2976
2977       if (i < 2 && inst->src[i + 1].file != BAD_FILE)
2978          printf(", ");
2979    }
2980
2981    printf(" ");
2982
2983    if (inst->force_uncompressed)
2984       printf("1sthalf ");
2985
2986    if (inst->force_sechalf)
2987       printf("2ndhalf ");
2988
2989    printf("\n");
2990 }
2991
2992 /**
2993  * Possibly returns an instruction that set up @param reg.
2994  *
2995  * Sometimes we want to take the result of some expression/variable
2996  * dereference tree and rewrite the instruction generating the result
2997  * of the tree.  When processing the tree, we know that the
2998  * instructions generated are all writing temporaries that are dead
2999  * outside of this tree.  So, if we have some instructions that write
3000  * a temporary, we're free to point that temp write somewhere else.
3001  *
3002  * Note that this doesn't guarantee that the instruction generated
3003  * only reg -- it might be the size=4 destination of a texture instruction.
3004  */
3005 fs_inst *
3006 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3007                                            fs_inst *end,
3008                                            fs_reg reg)
3009 {
3010    if (end == start ||
3011        end->is_partial_write() ||
3012        reg.reladdr ||
3013        !reg.equals(end->dst)) {
3014       return NULL;
3015    } else {
3016       return end;
3017    }
3018 }
3019
3020 void
3021 fs_visitor::setup_payload_gen6()
3022 {
3023    bool uses_depth =
3024       (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3025    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3026
3027    assert(brw->gen >= 6);
3028
3029    /* R0-1: masks, pixel X/Y coordinates. */
3030    c->nr_payload_regs = 2;
3031    /* R2: only for 32-pixel dispatch.*/
3032
3033    /* R3-26: barycentric interpolation coordinates.  These appear in the
3034     * same order that they appear in the brw_wm_barycentric_interp_mode
3035     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3036     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3037     * appear if they were enabled using the "Barycentric Interpolation
3038     * Mode" bits in WM_STATE.
3039     */
3040    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3041       if (barycentric_interp_modes & (1 << i)) {
3042          c->barycentric_coord_reg[i] = c->nr_payload_regs;
3043          c->nr_payload_regs += 2;
3044          if (dispatch_width == 16) {
3045             c->nr_payload_regs += 2;
3046          }
3047       }
3048    }
3049
3050    /* R27: interpolated depth if uses source depth */
3051    if (uses_depth) {
3052       c->source_depth_reg = c->nr_payload_regs;
3053       c->nr_payload_regs++;
3054       if (dispatch_width == 16) {
3055          /* R28: interpolated depth if not 8-wide. */
3056          c->nr_payload_regs++;
3057       }
3058    }
3059    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3060    if (uses_depth) {
3061       c->source_w_reg = c->nr_payload_regs;
3062       c->nr_payload_regs++;
3063       if (dispatch_width == 16) {
3064          /* R30: interpolated W if not 8-wide. */
3065          c->nr_payload_regs++;
3066       }
3067    }
3068
3069    c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3070    /* R31: MSAA position offsets. */
3071    if (c->prog_data.uses_pos_offset) {
3072       c->sample_pos_reg = c->nr_payload_regs;
3073       c->nr_payload_regs++;
3074    }
3075
3076    /* R32-: bary for 32-pixel. */
3077    /* R58-59: interp W for 32-pixel. */
3078
3079    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3080       c->source_depth_to_render_target = true;
3081    }
3082 }
3083
3084 void
3085 fs_visitor::assign_binding_table_offsets()
3086 {
3087    uint32_t next_binding_table_offset = 0;
3088
3089    /* If there are no color regions, we still perform an FB write to a null
3090     * renderbuffer, which we place at surface index 0.
3091     */
3092    c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3093    next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3094
3095    assign_common_binding_table_offsets(next_binding_table_offset);
3096 }
3097
3098 bool
3099 fs_visitor::run()
3100 {
3101    sanity_param_count = fp->Base.Parameters->NumParameters;
3102    uint32_t orig_nr_params = c->prog_data.nr_params;
3103    bool allocated_without_spills;
3104
3105    assign_binding_table_offsets();
3106
3107    if (brw->gen >= 6)
3108       setup_payload_gen6();
3109    else
3110       setup_payload_gen4();
3111
3112    if (0) {
3113       emit_dummy_fs();
3114    } else {
3115       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3116          emit_shader_time_begin();
3117
3118       calculate_urb_setup();
3119       if (fp->Base.InputsRead > 0) {
3120          if (brw->gen < 6)
3121             emit_interpolation_setup_gen4();
3122          else
3123             emit_interpolation_setup_gen6();
3124       }
3125
3126       /* We handle discards by keeping track of the still-live pixels in f0.1.
3127        * Initialize it with the dispatched pixels.
3128        */
3129       if (fp->UsesKill || c->key.alpha_test_func) {
3130          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3131          discard_init->flag_subreg = 1;
3132       }
3133
3134       /* Generate FS IR for main().  (the visitor only descends into
3135        * functions called "main").
3136        */
3137       if (shader) {
3138          foreach_list(node, &*shader->ir) {
3139             ir_instruction *ir = (ir_instruction *)node;
3140             base_ir = ir;
3141             this->result = reg_undef;
3142             ir->accept(this);
3143          }
3144       } else {
3145          emit_fragment_program_code();
3146       }
3147       base_ir = NULL;
3148       if (failed)
3149          return false;
3150
3151       emit(FS_OPCODE_PLACEHOLDER_HALT);
3152
3153       if (c->key.alpha_test_func)
3154          emit_alpha_test();
3155
3156       emit_fb_writes();
3157
3158       split_virtual_grfs();
3159
3160       move_uniform_array_access_to_pull_constants();
3161       remove_dead_constants();
3162       setup_pull_constants();
3163
3164       bool progress;
3165       do {
3166          progress = false;
3167
3168          compact_virtual_grfs();
3169
3170          progress = remove_duplicate_mrf_writes() || progress;
3171
3172          progress = opt_algebraic() || progress;
3173          progress = opt_cse() || progress;
3174          progress = opt_copy_propagate() || progress;
3175          progress = opt_peephole_sel() || progress;
3176          progress = opt_peephole_predicated_break() || progress;
3177          progress = dead_code_eliminate() || progress;
3178          progress = dead_code_eliminate_local() || progress;
3179          progress = dead_control_flow_eliminate(this) || progress;
3180          progress = register_coalesce() || progress;
3181          progress = compute_to_mrf() || progress;
3182       } while (progress);
3183
3184       lower_uniform_pull_constant_loads();
3185
3186       assign_curb_setup();
3187       assign_urb_setup();
3188
3189       static enum instruction_scheduler_mode pre_modes[] = {
3190          SCHEDULE_PRE,
3191          SCHEDULE_PRE_NON_LIFO,
3192          SCHEDULE_PRE_LIFO,
3193       };
3194
3195       /* Try each scheduling heuristic to see if it can successfully register
3196        * allocate without spilling.  They should be ordered by decreasing
3197        * performance but increasing likelihood of allocating.
3198        */
3199       for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3200          schedule_instructions(pre_modes[i]);
3201
3202          if (0) {
3203             assign_regs_trivial();
3204             allocated_without_spills = true;
3205          } else {
3206             allocated_without_spills = assign_regs(false);
3207          }
3208          if (allocated_without_spills)
3209             break;
3210       }
3211
3212       if (!allocated_without_spills) {
3213          /* We assume that any spilling is worse than just dropping back to
3214           * SIMD8.  There's probably actually some intermediate point where
3215           * SIMD16 with a couple of spills is still better.
3216           */
3217          if (dispatch_width == 16) {
3218             fail("Failure to register allocate.  Reduce number of "
3219                  "live scalar values to avoid this.");
3220          }
3221
3222          /* Since we're out of heuristics, just go spill registers until we
3223           * get an allocation.
3224           */
3225          while (!assign_regs(true)) {
3226             if (failed)
3227                break;
3228          }
3229       }
3230    }
3231    assert(force_uncompressed_stack == 0);
3232
3233    /* This must come after all optimization and register allocation, since
3234     * it inserts dead code that happens to have side effects, and it does
3235     * so based on the actual physical registers in use.
3236     */
3237    insert_gen4_send_dependency_workarounds();
3238
3239    if (failed)
3240       return false;
3241
3242    if (!allocated_without_spills)
3243       schedule_instructions(SCHEDULE_POST);
3244
3245    if (dispatch_width == 8) {
3246       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3247    } else {
3248       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3249
3250       /* Make sure we didn't try to sneak in an extra uniform */
3251       assert(orig_nr_params == c->prog_data.nr_params);
3252       (void) orig_nr_params;
3253    }
3254
3255    /* If any state parameters were appended, then ParameterValues could have
3256     * been realloced, in which case the driver uniform storage set up by
3257     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3258     * sure that didn't happen.
3259     */
3260    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3261
3262    return !failed;
3263 }
3264
3265 const unsigned *
3266 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3267                struct gl_fragment_program *fp,
3268                struct gl_shader_program *prog,
3269                unsigned *final_assembly_size)
3270 {
3271    bool start_busy = false;
3272    float start_time = 0;
3273
3274    if (unlikely(brw->perf_debug)) {
3275       start_busy = (brw->batch.last_bo &&
3276                     drm_intel_bo_busy(brw->batch.last_bo));
3277       start_time = get_time();
3278    }
3279
3280    struct brw_shader *shader = NULL;
3281    if (prog)
3282       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3283
3284    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3285       if (prog) {
3286          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3287          _mesa_print_ir(shader->ir, NULL);
3288          printf("\n\n");
3289       } else {
3290          printf("ARB_fragment_program %d ir for native fragment shader\n",
3291                 fp->Base.Id);
3292          _mesa_print_program(&fp->Base);
3293       }
3294    }
3295
3296    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3297     */
3298    fs_visitor v(brw, c, prog, fp, 8);
3299    if (!v.run()) {
3300       if (prog) {
3301          prog->LinkStatus = false;
3302          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3303       }
3304
3305       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3306                     v.fail_msg);
3307
3308       return NULL;
3309    }
3310
3311    exec_list *simd16_instructions = NULL;
3312    fs_visitor v2(brw, c, prog, fp, 16);
3313    if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3314       if (c->prog_data.nr_pull_params == 0) {
3315          /* Try a 16-wide compile */
3316          v2.import_uniforms(&v);
3317          if (!v2.run()) {
3318             perf_debug("16-wide shader failed to compile, falling back to "
3319                        "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3320          } else {
3321             simd16_instructions = &v2.instructions;
3322          }
3323       } else {
3324          perf_debug("Skipping 16-wide due to pull parameters.\n");
3325       }
3326    }
3327
3328    fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3329    const unsigned *generated = g.generate_assembly(&v.instructions,
3330                                                    simd16_instructions,
3331                                                    final_assembly_size);
3332
3333    if (unlikely(brw->perf_debug) && shader) {
3334       if (shader->compiled_once)
3335          brw_wm_debug_recompile(brw, prog, &c->key);
3336       shader->compiled_once = true;
3337
3338       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3339          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3340                     (get_time() - start_time) * 1000);
3341       }
3342    }
3343
3344    return generated;
3345 }
3346
3347 bool
3348 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3349 {
3350    struct brw_context *brw = brw_context(ctx);
3351    struct brw_wm_prog_key key;
3352
3353    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3354       return true;
3355
3356    struct gl_fragment_program *fp = (struct gl_fragment_program *)
3357       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3358    struct brw_fragment_program *bfp = brw_fragment_program(fp);
3359    bool program_uses_dfdy = fp->UsesDFdy;
3360
3361    memset(&key, 0, sizeof(key));
3362
3363    if (brw->gen < 6) {
3364       if (fp->UsesKill)
3365          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3366
3367       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3368          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3369
3370       /* Just assume depth testing. */
3371       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3372       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3373    }
3374
3375    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3376                                          BRW_FS_VARYING_INPUT_MASK) > 16)
3377       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3378
3379    key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3380
3381    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3382    for (unsigned i = 0; i < sampler_count; i++) {
3383       if (fp->Base.ShadowSamplers & (1 << i)) {
3384          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3385          key.tex.swizzles[i] =
3386             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3387       } else {
3388          /* Color sampler: assume no swizzling. */
3389          key.tex.swizzles[i] = SWIZZLE_XYZW;
3390       }
3391    }
3392
3393    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3394       key.drawable_height = ctx->DrawBuffer->Height;
3395    }
3396
3397    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3398       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3399    }
3400
3401    key.nr_color_regions = 1;
3402
3403    /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE.  The
3404     * quality of the derivatives is likely to be determined by the driconf
3405     * option.
3406     */
3407    key.high_quality_derivatives = brw->disable_derivative_optimization;
3408
3409    key.program_string_id = bfp->id;
3410
3411    uint32_t old_prog_offset = brw->wm.base.prog_offset;
3412    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3413
3414    bool success = do_wm_prog(brw, prog, bfp, &key);
3415
3416    brw->wm.base.prog_offset = old_prog_offset;
3417    brw->wm.prog_data = old_prog_data;
3418
3419    return success;
3420 }