i965: Add support for Broadwell's new register types.
[mesa.git] / src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
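/* Convenience emitters.  Each ALUn(op) below defines fs_visitor::op(dst,
 * src...), which allocates a new fs_inst with BRW_OPCODE_##op out of
 * mem_ctx.  These only construct the instruction; the caller still has to
 * emit() it or push it onto an exec_list.
 */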
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
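/* For example, emit_frontfacing_interpolation() below does
 *
 *    emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
 *    emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
 *
 * relying on only the low bit of each destination channel being defined.
 */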
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258 * We break down the const_offset to a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
264 */
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
303
304 /**
305 * A helper for MOV generation for fixing up broken hardware SEND dependency
306 * handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
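/**
 * Returns true if this instruction's message payload is read directly from
 * the GRF (send-from-GRF) instead of being copied through MRFs first.
 */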
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
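/**
 * Returns true if source modifiers (negate/abs) may be folded into this
 * instruction's sources.  Gen6 math instructions ignore source modifiers,
 * and send-from-GRF payloads can't apply them, so optimization passes must
 * leave those sources alone.
 */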
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 this->smear = -1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 type == r.type &&
436 negate == r.negate &&
437 abs == r.abs &&
438 !reladdr && !r.reladdr &&
439 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
440 sizeof(fixed_hw_reg)) == 0 &&
441 smear == r.smear &&
442 imm.u == r.imm.u);
443 }
444
445 fs_reg
446 fs_reg::retype(uint32_t type)
447 {
448 fs_reg result = *this;
449 result.type = type;
450 return result;
451 }
452
453 bool
454 fs_reg::is_zero() const
455 {
456 if (file != IMM)
457 return false;
458
459 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
460 }
461
462 bool
463 fs_reg::is_one() const
464 {
465 if (file != IMM)
466 return false;
467
468 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
469 }
470
471 bool
472 fs_reg::is_null() const
473 {
474 return file == HW_REG &&
475 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
476 fixed_hw_reg.nr == BRW_ARF_NULL;
477 }
478
479 bool
480 fs_reg::is_valid_3src() const
481 {
482 return file == GRF || file == UNIFORM;
483 }
484
485 int
486 fs_visitor::type_size(const struct glsl_type *type)
487 {
488 unsigned int size, i;
489
490 switch (type->base_type) {
491 case GLSL_TYPE_UINT:
492 case GLSL_TYPE_INT:
493 case GLSL_TYPE_FLOAT:
494 case GLSL_TYPE_BOOL:
495 return type->components();
496 case GLSL_TYPE_ARRAY:
497 return type_size(type->fields.array) * type->length;
498 case GLSL_TYPE_STRUCT:
499 size = 0;
500 for (i = 0; i < type->length; i++) {
501 size += type_size(type->fields.structure[i].type);
502 }
503 return size;
504 case GLSL_TYPE_SAMPLER:
505 /* Samplers take up no register space, since they're baked in at
506 * link time.
507 */
508 return 0;
509 case GLSL_TYPE_ATOMIC_UINT:
510 return 0;
511 case GLSL_TYPE_VOID:
512 case GLSL_TYPE_ERROR:
513 case GLSL_TYPE_INTERFACE:
514 assert(!"not reached");
515 break;
516 }
517
518 return 0;
519 }
520
521 fs_reg
522 fs_visitor::get_timestamp()
523 {
524 assert(brw->gen >= 7);
525
526 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
527 BRW_ARF_TIMESTAMP,
528 0),
529 BRW_REGISTER_TYPE_UD));
530
531 fs_reg dst = fs_reg(this, glsl_type::uint_type);
532
533 fs_inst *mov = emit(MOV(dst, ts));
534 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
535 * even if it's not enabled in the dispatch.
536 */
537 mov->force_writemask_all = true;
538 mov->force_uncompressed = true;
539
540 /* The caller wants the low 32 bits of the timestamp. Since it's running
541          * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
542 * which is plenty of time for our purposes. It is identical across the
543 * EUs, but since it's tracking GPU core speed it will increment at a
544 * varying rate as render P-states change.
545 *
546 * The caller could also check if render P-states have changed (or anything
547 * else that might disrupt timing) by setting smear to 2 and checking if
548 * that field is != 0.
549 */
550 dst.smear = 0;
551
552 return dst;
553 }
554
555 void
556 fs_visitor::emit_shader_time_begin()
557 {
558 current_annotation = "shader time start";
559 shader_start_time = get_timestamp();
560 }
561
562 void
563 fs_visitor::emit_shader_time_end()
564 {
565 current_annotation = "shader time end";
566
567 enum shader_time_shader_type type, written_type, reset_type;
568 if (dispatch_width == 8) {
569 type = ST_FS8;
570 written_type = ST_FS8_WRITTEN;
571 reset_type = ST_FS8_RESET;
572 } else {
573 assert(dispatch_width == 16);
574 type = ST_FS16;
575 written_type = ST_FS16_WRITTEN;
576 reset_type = ST_FS16_RESET;
577 }
578
579 fs_reg shader_end_time = get_timestamp();
580
581 /* Check that there weren't any timestamp reset events (assuming these
582 * were the only two timestamp reads that happened).
583 */
584 fs_reg reset = shader_end_time;
585 reset.smear = 2;
586 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
587 test->conditional_mod = BRW_CONDITIONAL_Z;
588 emit(IF(BRW_PREDICATE_NORMAL));
589
590 push_force_uncompressed();
591 fs_reg start = shader_start_time;
592 start.negate = true;
593 fs_reg diff = fs_reg(this, glsl_type::uint_type);
594 emit(ADD(diff, start, shader_end_time));
595
596 /* If there were no instructions between the two timestamp gets, the diff
597 * is 2 cycles. Remove that overhead, so I can forget about that when
598 * trying to determine the time taken for single instructions.
599 */
600 emit(ADD(diff, diff, fs_reg(-2u)));
601
602 emit_shader_time_write(type, diff);
603 emit_shader_time_write(written_type, fs_reg(1u));
604 emit(BRW_OPCODE_ELSE);
605 emit_shader_time_write(reset_type, fs_reg(1u));
606 emit(BRW_OPCODE_ENDIF);
607
608 pop_force_uncompressed();
609 }
610
611 void
612 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
613 fs_reg value)
614 {
615 int shader_time_index =
616 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
617 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
618
619 fs_reg payload;
620 if (dispatch_width == 8)
621 payload = fs_reg(this, glsl_type::uvec2_type);
622 else
623 payload = fs_reg(this, glsl_type::uint_type);
624
625 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
626 fs_reg(), payload, offset, value));
627 }
628
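/**
 * Mark the compile as failed, recording a formatted message for the caller.
 * Only the first failure message is kept; with INTEL_DEBUG=wm it is also
 * printed to stderr.
 */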
629 void
630 fs_visitor::fail(const char *format, ...)
631 {
632 va_list va;
633 char *msg;
634
635 if (failed)
636 return;
637
638 failed = true;
639
640 va_start(va, format);
641 msg = ralloc_vasprintf(mem_ctx, format, va);
642 va_end(va);
643 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
644
645 this->fail_msg = msg;
646
647 if (INTEL_DEBUG & DEBUG_WM) {
648 fprintf(stderr, "%s", msg);
649 }
650 }
651
652 fs_inst *
653 fs_visitor::emit(enum opcode opcode)
654 {
655 return emit(fs_inst(opcode));
656 }
657
658 fs_inst *
659 fs_visitor::emit(enum opcode opcode, fs_reg dst)
660 {
661 return emit(fs_inst(opcode, dst));
662 }
663
664 fs_inst *
665 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
666 {
667 return emit(fs_inst(opcode, dst, src0));
668 }
669
670 fs_inst *
671 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
672 {
673 return emit(fs_inst(opcode, dst, src0, src1));
674 }
675
676 fs_inst *
677 fs_visitor::emit(enum opcode opcode, fs_reg dst,
678 fs_reg src0, fs_reg src1, fs_reg src2)
679 {
680 return emit(fs_inst(opcode, dst, src0, src1, src2));
681 }
682
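/* While force_uncompressed_stack is non-zero, instructions emitted through
 * the visitor get force_uncompressed set, so they execute SIMD8 even in a
 * SIMD16 shader.  emit_shader_time_end() above uses this around its
 * timestamp arithmetic.
 */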
683 void
684 fs_visitor::push_force_uncompressed()
685 {
686 force_uncompressed_stack++;
687 }
688
689 void
690 fs_visitor::pop_force_uncompressed()
691 {
692 force_uncompressed_stack--;
693 assert(force_uncompressed_stack >= 0);
694 }
695
696 /**
697 * Returns true if the instruction has a flag that means it won't
698 * update an entire destination register.
699 *
700 * For example, dead code elimination and live variable analysis want to know
701 * when a write to a variable screens off any preceding values that were in
702 * it.
703 */
704 bool
705 fs_inst::is_partial_write()
706 {
707 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
708 this->force_uncompressed ||
709 this->force_sechalf);
710 }
711
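/**
 * Returns the number of virtual GRF registers read by source \p arg.
 *
 * Most sources read one register.  Texture messages sent from the GRF read
 * their whole mlen-register payload through src[0]; in SIMD16 each virtual
 * GRF spans two hardware registers, so that payload covers (mlen + 1) / 2
 * virtual registers.
 */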
712 int
713 fs_inst::regs_read(fs_visitor *v, int arg)
714 {
715 if (is_tex() && arg == 0 && src[0].file == GRF) {
716 if (v->dispatch_width == 16)
717 return (mlen + 1) / 2;
718 else
719 return mlen;
720 }
721 return 1;
722 }
723
724 bool
725 fs_inst::reads_flag()
726 {
727 return predicate;
728 }
729
730 bool
731 fs_inst::writes_flag()
732 {
733 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
734 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
735 }
736
737 /**
738 * Returns how many MRFs an FS opcode will write over.
739 *
740  * Note that this is not just the 0 or 1 implied writes of the actual gen
741  * instruction -- the FS opcodes often generate additional MOVs to MRFs.
742 */
743 int
744 fs_visitor::implied_mrf_writes(fs_inst *inst)
745 {
746 if (inst->mlen == 0)
747 return 0;
748
749 if (inst->base_mrf == -1)
750 return 0;
751
752 switch (inst->opcode) {
753 case SHADER_OPCODE_RCP:
754 case SHADER_OPCODE_RSQ:
755 case SHADER_OPCODE_SQRT:
756 case SHADER_OPCODE_EXP2:
757 case SHADER_OPCODE_LOG2:
758 case SHADER_OPCODE_SIN:
759 case SHADER_OPCODE_COS:
760 return 1 * dispatch_width / 8;
761 case SHADER_OPCODE_POW:
762 case SHADER_OPCODE_INT_QUOTIENT:
763 case SHADER_OPCODE_INT_REMAINDER:
764 return 2 * dispatch_width / 8;
765 case SHADER_OPCODE_TEX:
766 case FS_OPCODE_TXB:
767 case SHADER_OPCODE_TXD:
768 case SHADER_OPCODE_TXF:
769 case SHADER_OPCODE_TXF_MS:
770 case SHADER_OPCODE_TXF_MCS:
771 case SHADER_OPCODE_TG4:
772 case SHADER_OPCODE_TG4_OFFSET:
773 case SHADER_OPCODE_TXL:
774 case SHADER_OPCODE_TXS:
775 case SHADER_OPCODE_LOD:
776 return 1;
777 case FS_OPCODE_FB_WRITE:
778 return 2;
779 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
780 case SHADER_OPCODE_GEN4_SCRATCH_READ:
781 return 1;
782 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
783 return inst->mlen;
784 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
785 return 2;
786 case SHADER_OPCODE_UNTYPED_ATOMIC:
787 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
788 return 0;
789 default:
790 assert(!"not reached");
791 return inst->mlen;
792 }
793 }
794
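/**
 * Allocate a new virtual GRF of \p size contiguous registers and return its
 * index, growing (doubling) the virtual_grf_sizes array as needed.
 */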
795 int
796 fs_visitor::virtual_grf_alloc(int size)
797 {
798 if (virtual_grf_array_size <= virtual_grf_count) {
799 if (virtual_grf_array_size == 0)
800 virtual_grf_array_size = 16;
801 else
802 virtual_grf_array_size *= 2;
803 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
804 virtual_grf_array_size);
805 }
806 virtual_grf_sizes[virtual_grf_count] = size;
807 return virtual_grf_count++;
808 }
809
810 /** Register file/index constructor, defaulting to float type. */
811 fs_reg::fs_reg(enum register_file file, int reg)
812 {
813 init();
814 this->file = file;
815 this->reg = reg;
816 this->type = BRW_REGISTER_TYPE_F;
817 }
818
819 /** Register file/index constructor with an explicit register type. */
820 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
821 {
822 init();
823 this->file = file;
824 this->reg = reg;
825 this->type = type;
826 }
827
828 /** Automatic reg constructor. */
829 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
830 {
831 init();
832
833 this->file = GRF;
834 this->reg = v->virtual_grf_alloc(v->type_size(type));
835 this->reg_offset = 0;
836 this->type = brw_type_for_base_type(type);
837 }
838
839 fs_reg *
840 fs_visitor::variable_storage(ir_variable *var)
841 {
842 return (fs_reg *)hash_table_find(this->variable_ht, var);
843 }
844
845 void
846 import_uniforms_callback(const void *key,
847 void *data,
848 void *closure)
849 {
850 struct hash_table *dst_ht = (struct hash_table *)closure;
851 const fs_reg *reg = (const fs_reg *)data;
852
853 if (reg->file != UNIFORM)
854 return;
855
856 hash_table_insert(dst_ht, data, key);
857 }
858
859 /* For 16-wide, we need to follow the uniform setup done by the 8-wide dispatch
860  * compile.  This brings in those uniform definitions.
861  */
862 void
863 fs_visitor::import_uniforms(fs_visitor *v)
864 {
865 hash_table_call_foreach(v->variable_ht,
866 import_uniforms_callback,
867 variable_ht);
868 this->params_remap = v->params_remap;
869 this->nr_params_remap = v->nr_params_remap;
870 }
871
872 /* Our support for uniforms is piggy-backed on the struct
873 * gl_fragment_program, because that's where the values actually
874 * get stored, rather than in some global gl_shader_program uniform
875 * store.
876 */
877 void
878 fs_visitor::setup_uniform_values(ir_variable *ir)
879 {
880 int namelen = strlen(ir->name);
881
882 /* The data for our (non-builtin) uniforms is stored in a series of
883 * gl_uniform_driver_storage structs for each subcomponent that
884 * glGetUniformLocation() could name. We know it's been set up in the same
885 * order we'd walk the type, so walk the list of storage and find anything
886 * with our name, or the prefix of a component that starts with our name.
887 */
888 unsigned params_before = c->prog_data.nr_params;
889 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
890 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
891
892 if (strncmp(ir->name, storage->name, namelen) != 0 ||
893 (storage->name[namelen] != 0 &&
894 storage->name[namelen] != '.' &&
895 storage->name[namelen] != '[')) {
896 continue;
897 }
898
899 unsigned slots = storage->type->component_slots();
900 if (storage->array_elements)
901 slots *= storage->array_elements;
902
903 for (unsigned i = 0; i < slots; i++) {
904 c->prog_data.param[c->prog_data.nr_params++] =
905 &storage->storage[i].f;
906 }
907 }
908
909 /* Make sure we actually initialized the right amount of stuff here. */
910 assert(params_before + ir->type->component_slots() ==
911 c->prog_data.nr_params);
912 (void)params_before;
913 }
914
915
916 /* Our support for builtin uniforms is even scarier than non-builtin.
917 * It sits on top of the PROG_STATE_VAR parameters that are
918 * automatically updated from GL context state.
919 */
920 void
921 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
922 {
923 const ir_state_slot *const slots = ir->state_slots;
924 assert(ir->state_slots != NULL);
925
926 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
927 /* This state reference has already been setup by ir_to_mesa, but we'll
928 * get the same index back here.
929 */
930 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
931 (gl_state_index *)slots[i].tokens);
932
933 /* Add each of the unique swizzles of the element as a parameter.
934 * This'll end up matching the expected layout of the
935 * array/matrix/structure we're trying to fill in.
936 */
937 int last_swiz = -1;
938 for (unsigned int j = 0; j < 4; j++) {
939 int swiz = GET_SWZ(slots[i].swizzle, j);
940 if (swiz == last_swiz)
941 break;
942 last_swiz = swiz;
943
944 c->prog_data.param[c->prog_data.nr_params++] =
945 &fp->Base.Parameters->ParameterValues[index][swiz].f;
946 }
947 }
948 }
949
950 fs_reg *
951 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
952 {
953 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
954 fs_reg wpos = *reg;
955 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
956
957 /* gl_FragCoord.x */
958 if (ir->data.pixel_center_integer) {
959 emit(MOV(wpos, this->pixel_x));
960 } else {
961 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
962 }
963 wpos.reg_offset++;
964
965 /* gl_FragCoord.y */
966 if (!flip && ir->data.pixel_center_integer) {
967 emit(MOV(wpos, this->pixel_y));
968 } else {
969 fs_reg pixel_y = this->pixel_y;
970 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
971
972 if (flip) {
973 pixel_y.negate = true;
974 offset += c->key.drawable_height - 1.0;
975 }
976
977 emit(ADD(wpos, pixel_y, fs_reg(offset)));
978 }
979 wpos.reg_offset++;
980
981 /* gl_FragCoord.z */
982 if (brw->gen >= 6) {
983 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
984 } else {
985 emit(FS_OPCODE_LINTERP, wpos,
986 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
987 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
988 interp_reg(VARYING_SLOT_POS, 2));
989 }
990 wpos.reg_offset++;
991
992 /* gl_FragCoord.w: Already set up in emit_interpolation */
993 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
994
995 return reg;
996 }
997
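/**
 * Emit an FS_OPCODE_LINTERP for \p attr using the barycentric coordinates
 * matching the interpolation qualifier and centroid-ness.  Before gen6 only
 * perspective-correct pixel barycentrics exist, so everything maps to
 * BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC there.
 */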
998 fs_inst *
999 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1000 glsl_interp_qualifier interpolation_mode,
1001 bool is_centroid)
1002 {
1003 brw_wm_barycentric_interp_mode barycoord_mode;
1004 if (brw->gen >= 6) {
1005 if (is_centroid) {
1006 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1007 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1008 else
1009 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1010 } else {
1011 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1012 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1013 else
1014 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1015 }
1016 } else {
1017 /* On Ironlake and below, there is only one interpolation mode.
1018 * Centroid interpolation doesn't mean anything on this hardware --
1019 * there is no multisampling.
1020 */
1021 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1022 }
1023 return emit(FS_OPCODE_LINTERP, attr,
1024 this->delta_x[barycoord_mode],
1025 this->delta_y[barycoord_mode], interp);
1026 }
1027
1028 fs_reg *
1029 fs_visitor::emit_general_interpolation(ir_variable *ir)
1030 {
1031 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1032 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1033 fs_reg attr = *reg;
1034
1035 unsigned int array_elements;
1036 const glsl_type *type;
1037
1038 if (ir->type->is_array()) {
1039 array_elements = ir->type->length;
1040 if (array_elements == 0) {
1041 fail("dereferenced array '%s' has length 0\n", ir->name);
1042 }
1043 type = ir->type->fields.array;
1044 } else {
1045 array_elements = 1;
1046 type = ir->type;
1047 }
1048
1049 glsl_interp_qualifier interpolation_mode =
1050 ir->determine_interpolation_mode(c->key.flat_shade);
1051
1052 int location = ir->data.location;
1053 for (unsigned int i = 0; i < array_elements; i++) {
1054 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1055 if (c->prog_data.urb_setup[location] == -1) {
1056 /* If there's no incoming setup data for this slot, don't
1057 * emit interpolation for it.
1058 */
1059 attr.reg_offset += type->vector_elements;
1060 location++;
1061 continue;
1062 }
1063
1064 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1065 /* Constant interpolation (flat shading) case. The SF has
1066 * handed us defined values in only the constant offset
1067 * field of the setup reg.
1068 */
1069 for (unsigned int k = 0; k < type->vector_elements; k++) {
1070 struct brw_reg interp = interp_reg(location, k);
1071 interp = suboffset(interp, 3);
1072 interp.type = reg->type;
1073 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1074 attr.reg_offset++;
1075 }
1076 } else {
1077 /* Smooth/noperspective interpolation case. */
1078 for (unsigned int k = 0; k < type->vector_elements; k++) {
1079 /* FINISHME: At some point we probably want to push
1080 * this farther by giving similar treatment to the
1081 * other potentially constant components of the
1082 * attribute, as well as making brw_vs_constval.c
1083 * handle varyings other than gl_TexCoord.
1084 */
1085 struct brw_reg interp = interp_reg(location, k);
1086 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1087 ir->data.centroid);
1088 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1089 /* Get the pixel/sample mask into f0 so that we know
1090 * which pixels are lit. Then, for each channel that is
1091 * unlit, replace the centroid data with non-centroid
1092 * data.
1093 */
1094 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1095 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1096 interpolation_mode, false);
1097 inst->predicate = BRW_PREDICATE_NORMAL;
1098 inst->predicate_inverse = true;
1099 }
1100 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1101 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1102 }
1103 attr.reg_offset++;
1104 }
1105
1106 }
1107 location++;
1108 }
1109 }
1110
1111 return reg;
1112 }
1113
1114 fs_reg *
1115 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1116 {
1117 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1118
1119 /* The frontfacing comes in as a bit in the thread payload. */
1120 if (brw->gen >= 6) {
1121 emit(BRW_OPCODE_ASR, *reg,
1122 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1123 fs_reg(15));
1124 emit(BRW_OPCODE_NOT, *reg, *reg);
1125 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1126 } else {
1127 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1128 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1129 * us front face
1130 */
1131 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1132 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1133 }
1134
1135 return reg;
1136 }
1137
1138 void
1139 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1140 {
1141 assert(dst.type == BRW_REGISTER_TYPE_F);
1142
1143 if (c->key.compute_pos_offset) {
1144 /* Convert int_sample_pos to floating point */
1145 emit(MOV(dst, int_sample_pos));
1146 /* Scale to the range [0, 1] */
1147 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1148 }
1149 else {
1150 /* From ARB_sample_shading specification:
1151 * "When rendering to a non-multisample buffer, or if multisample
1152 * rasterization is disabled, gl_SamplePosition will always be
1153       *   (0.5, 0.5)."
1154 */
1155 emit(MOV(dst, fs_reg(0.5f)));
1156 }
1157 }
1158
1159 fs_reg *
1160 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1161 {
1162 assert(brw->gen >= 6);
1163 assert(ir->type == glsl_type::vec2_type);
1164
1165 this->current_annotation = "compute sample position";
1166 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1167 fs_reg pos = *reg;
1168 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1169 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1170
1171 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1172 * mode will be enabled.
1173 *
1174 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1175 * R31.1:0 Position Offset X/Y for Slot[3:0]
1176 * R31.3:2 Position Offset X/Y for Slot[7:4]
1177 * .....
1178 *
1179 * The X, Y sample positions come in as bytes in thread payload. So, read
1180 * the positions using vstride=16, width=8, hstride=2.
1181 */
1182 struct brw_reg sample_pos_reg =
1183 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1184 BRW_REGISTER_TYPE_B), 16, 8, 2);
1185
1186 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1187 if (dispatch_width == 16) {
1188 int_sample_x.sechalf = true;
1189 fs_inst *inst = emit(MOV(int_sample_x,
1190 fs_reg(suboffset(sample_pos_reg, 16))));
1191 inst->force_sechalf = true;
1192 int_sample_x.sechalf = false;
1193 }
1194 /* Compute gl_SamplePosition.x */
1195 compute_sample_position(pos, int_sample_x);
1196 pos.reg_offset++;
1197 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1198 if (dispatch_width == 16) {
1199 int_sample_y.sechalf = true;
1200 fs_inst *inst = emit(MOV(int_sample_y,
1201 fs_reg(suboffset(sample_pos_reg, 17))));
1202 inst->force_sechalf = true;
1203 int_sample_y.sechalf = false;
1204 }
1205 /* Compute gl_SamplePosition.y */
1206 compute_sample_position(pos, int_sample_y);
1207 return reg;
1208 }
1209
1210 fs_reg *
1211 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1212 {
1213 assert(brw->gen >= 6);
1214
1215 this->current_annotation = "compute sample id";
1216 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1217
1218 if (c->key.compute_sample_id) {
1219 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1220 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1221 t2.type = BRW_REGISTER_TYPE_UW;
1222
1223 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1224 * 8x multisampling, subspan 0 will represent sample N (where N
1225 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1226 * 7. We can find the value of N by looking at R0.0 bits 7:6
1227 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1228 * (since samples are always delivered in pairs). That is, we
1229 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1230 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1231 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1232 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1233 * populating a temporary variable with the sequence (0, 1, 2, 3),
1234 * and then reading from it using vstride=1, width=4, hstride=0.
1235       * These computations also hold for 4x multisampling.
1236 */
1237 emit(BRW_OPCODE_AND, t1,
1238 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1239 fs_reg(brw_imm_d(0xc0)));
1240 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1241 /* This works for both SIMD8 and SIMD16 */
1242 emit(MOV(t2, brw_imm_v(0x3210)));
1243 /* This special instruction takes care of setting vstride=1,
1244 * width=4, hstride=0 of t2 during an ADD instruction.
1245 */
1246 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1247 } else {
1248 /* As per GL_ARB_sample_shading specification:
1249 * "When rendering to a non-multisample buffer, or if multisample
1250 * rasterization is disabled, gl_SampleID will always be zero."
1251 */
1252 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1253 }
1254
1255 return reg;
1256 }
1257
1258 fs_reg *
1259 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1260 {
1261 assert(brw->gen >= 7);
1262 this->current_annotation = "compute gl_SampleMaskIn";
1263 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1264 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1265 return reg;
1266 }
1267
1268 fs_reg
1269 fs_visitor::fix_math_operand(fs_reg src)
1270 {
1271 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1272 * might be able to do better by doing execsize = 1 math and then
1273 * expanding that result out, but we would need to be careful with
1274 * masking.
1275 *
1276 * The hardware ignores source modifiers (negate and abs) on math
1277 * instructions, so we also move to a temp to set those up.
1278 */
1279 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1280 !src.abs && !src.negate)
1281 return src;
1282
1283 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1284 * operands to math
1285 */
1286 if (brw->gen >= 7 && src.file != IMM)
1287 return src;
1288
1289 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1290 expanded.type = src.type;
1291 emit(BRW_OPCODE_MOV, expanded, src);
1292 return expanded;
1293 }
1294
1295 fs_inst *
1296 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1297 {
1298 switch (opcode) {
1299 case SHADER_OPCODE_RCP:
1300 case SHADER_OPCODE_RSQ:
1301 case SHADER_OPCODE_SQRT:
1302 case SHADER_OPCODE_EXP2:
1303 case SHADER_OPCODE_LOG2:
1304 case SHADER_OPCODE_SIN:
1305 case SHADER_OPCODE_COS:
1306 break;
1307 default:
1308 assert(!"not reached: bad math opcode");
1309 return NULL;
1310 }
1311
1312 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1313 * might be able to do better by doing execsize = 1 math and then
1314 * expanding that result out, but we would need to be careful with
1315 * masking.
1316 *
1317 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1318 * instructions, so we also move to a temp to set those up.
1319 */
1320 if (brw->gen >= 6)
1321 src = fix_math_operand(src);
1322
1323 fs_inst *inst = emit(opcode, dst, src);
1324
1325 if (brw->gen < 6) {
1326 inst->base_mrf = 2;
1327 inst->mlen = dispatch_width / 8;
1328 }
1329
1330 return inst;
1331 }
1332
1333 fs_inst *
1334 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1335 {
1336 int base_mrf = 2;
1337 fs_inst *inst;
1338
1339 switch (opcode) {
1340 case SHADER_OPCODE_INT_QUOTIENT:
1341 case SHADER_OPCODE_INT_REMAINDER:
1342 if (brw->gen >= 7 && dispatch_width == 16)
1343 fail("16-wide INTDIV unsupported\n");
1344 break;
1345 case SHADER_OPCODE_POW:
1346 break;
1347 default:
1348 assert(!"not reached: unsupported binary math opcode.");
1349 return NULL;
1350 }
1351
1352 if (brw->gen >= 6) {
1353 src0 = fix_math_operand(src0);
1354 src1 = fix_math_operand(src1);
1355
1356 inst = emit(opcode, dst, src0, src1);
1357 } else {
1358 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1359 * "Message Payload":
1360 *
1361 * "Operand0[7]. For the INT DIV functions, this operand is the
1362 * denominator."
1363 * ...
1364 * "Operand1[7]. For the INT DIV functions, this operand is the
1365 * numerator."
1366 */
1367 bool is_int_div = opcode != SHADER_OPCODE_POW;
1368 fs_reg &op0 = is_int_div ? src1 : src0;
1369 fs_reg &op1 = is_int_div ? src0 : src1;
1370
1371 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1372 inst = emit(opcode, dst, op0, reg_null_f);
1373
1374 inst->base_mrf = base_mrf;
1375 inst->mlen = 2 * dispatch_width / 8;
1376 }
1377 return inst;
1378 }
1379
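/**
 * Map UNIFORM-file registers onto the push constant (CURB) payload:
 * computes curb_read_length and rewrites every UNIFORM source as a HW_REG
 * access into the payload registers following nr_payload_regs.
 */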
1380 void
1381 fs_visitor::assign_curb_setup()
1382 {
1383 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1384 if (dispatch_width == 8) {
1385 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1386 } else {
1387 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1388 }
1389
1390 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1391 foreach_list(node, &this->instructions) {
1392 fs_inst *inst = (fs_inst *)node;
1393
1394 for (unsigned int i = 0; i < 3; i++) {
1395 if (inst->src[i].file == UNIFORM) {
1396 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1397 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1398 constant_nr / 8,
1399 constant_nr % 8);
1400
1401 inst->src[i].file = HW_REG;
1402 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1403 }
1404 }
1405 }
1406 }
1407
1408 void
1409 fs_visitor::calculate_urb_setup()
1410 {
1411 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1412 c->prog_data.urb_setup[i] = -1;
1413 }
1414
1415 int urb_next = 0;
1416 /* Figure out where each of the incoming setup attributes lands. */
1417 if (brw->gen >= 6) {
1418 if (_mesa_bitcount_64(fp->Base.InputsRead &
1419 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1420 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1421 * first 16 varying inputs, so we can put them wherever we want.
1422 * Just put them in order.
1423 *
1424 * This is useful because it means that (a) inputs not used by the
1425 * fragment shader won't take up valuable register space, and (b) we
1426 * won't have to recompile the fragment shader if it gets paired with
1427 * a different vertex (or geometry) shader.
1428 */
1429 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1430 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1431 BITFIELD64_BIT(i)) {
1432 c->prog_data.urb_setup[i] = urb_next++;
1433 }
1434 }
1435 } else {
1436 /* We have enough input varyings that the SF/SBE pipeline stage can't
1437 * arbitrarily rearrange them to suit our whim; we have to put them
1438 * in an order that matches the output of the previous pipeline stage
1439 * (geometry or vertex shader).
1440 */
1441 struct brw_vue_map prev_stage_vue_map;
1442 brw_compute_vue_map(brw, &prev_stage_vue_map,
1443 c->key.input_slots_valid);
1444 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1445 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1446 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1447 slot++) {
1448 int varying = prev_stage_vue_map.slot_to_varying[slot];
1449 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1450 * unused.
1451 */
1452 if (varying != BRW_VARYING_SLOT_COUNT &&
1453 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1454 BITFIELD64_BIT(varying))) {
1455 c->prog_data.urb_setup[varying] = slot - first_slot;
1456 }
1457 }
1458 urb_next = prev_stage_vue_map.num_slots - first_slot;
1459 }
1460 } else {
1461 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1462 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1463 /* Point size is packed into the header, not as a general attribute */
1464 if (i == VARYING_SLOT_PSIZ)
1465 continue;
1466
1467 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1468 /* The back color slot is skipped when the front color is
1469 * also written to. In addition, some slots can be
1470 * written in the vertex shader and not read in the
1471 * fragment shader. So the register number must always be
1472 * incremented, mapped or not.
1473 */
1474 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1475 c->prog_data.urb_setup[i] = urb_next;
1476 urb_next++;
1477 }
1478 }
1479
1480 /*
1481     * It's an FS-only attribute, and we did interpolation for this attribute
1482     * in the SF thread.  So, count it here, too.
1483 *
1484 * See compile_sf_prog() for more info.
1485 */
1486 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1487 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1488 }
1489
1490 c->prog_data.num_varying_inputs = urb_next;
1491 }
1492
1493 void
1494 fs_visitor::assign_urb_setup()
1495 {
1496 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1497
1498 /* Offset all the urb_setup[] index by the actual position of the
1499 * setup regs, now that the location of the constants has been chosen.
1500 */
1501 foreach_list(node, &this->instructions) {
1502 fs_inst *inst = (fs_inst *)node;
1503
1504 if (inst->opcode == FS_OPCODE_LINTERP) {
1505 assert(inst->src[2].file == HW_REG);
1506 inst->src[2].fixed_hw_reg.nr += urb_start;
1507 }
1508
1509 if (inst->opcode == FS_OPCODE_CINTERP) {
1510 assert(inst->src[0].file == HW_REG);
1511 inst->src[0].fixed_hw_reg.nr += urb_start;
1512 }
1513 }
1514
1515 /* Each attribute is 4 setup channels, each of which is half a reg. */
1516 this->first_non_payload_grf =
1517 urb_start + c->prog_data.num_varying_inputs * 2;
1518 }
1519
1520 /**
1521 * Split large virtual GRFs into separate components if we can.
1522 *
1523 * This is mostly duplicated with what brw_fs_vector_splitting does,
1524 * but that's really conservative because it's afraid of doing
1525 * splitting that doesn't result in real progress after the rest of
1526 * the optimization phases, which would cause infinite looping in
1527 * optimization. We can do it once here, safely. This also has the
1528 * opportunity to split interpolated values, or maybe even uniforms,
1529 * which we don't have at the IR level.
1530 *
1531 * We want to split, because virtual GRFs are what we register
1532 * allocate and spill (due to contiguousness requirements for some
1533 * instructions), and they're what we naturally generate in the
1534 * codegen process, but most virtual GRFs don't actually need to be
1535 * contiguous sets of GRFs. If we split, we'll end up with reduced
1536 * live intervals and better dead code elimination and coalescing.
1537 */
1538 void
1539 fs_visitor::split_virtual_grfs()
1540 {
1541 int num_vars = this->virtual_grf_count;
1542 bool split_grf[num_vars];
1543 int new_virtual_grf[num_vars];
1544
1545 /* Try to split anything > 0 sized. */
1546 for (int i = 0; i < num_vars; i++) {
1547 if (this->virtual_grf_sizes[i] != 1)
1548 split_grf[i] = true;
1549 else
1550 split_grf[i] = false;
1551 }
1552
1553 if (brw->has_pln &&
1554 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1555 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1556 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1557 * Gen6, that was the only supported interpolation mode, and since Gen6,
1558 * delta_x and delta_y are in fixed hardware registers.
1559 */
1560 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1561 false;
1562 }
1563
1564 foreach_list(node, &this->instructions) {
1565 fs_inst *inst = (fs_inst *)node;
1566
1567 /* If there's a SEND message that requires contiguous destination
1568 * registers, no splitting is allowed.
1569 */
1570 if (inst->regs_written > 1) {
1571 split_grf[inst->dst.reg] = false;
1572 }
1573
1574 /* If we're sending from a GRF, don't split it, on the assumption that
1575 * the send is reading the whole thing.
1576 */
1577 if (inst->is_send_from_grf()) {
1578 for (int i = 0; i < 3; i++) {
1579 if (inst->src[i].file == GRF) {
1580 split_grf[inst->src[i].reg] = false;
1581 }
1582 }
1583 }
1584 }
1585
1586 /* Allocate new space for split regs. Note that the virtual
1587 * numbers will be contiguous.
1588 */
1589 for (int i = 0; i < num_vars; i++) {
1590 if (split_grf[i]) {
1591 new_virtual_grf[i] = virtual_grf_alloc(1);
1592 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1593 int reg = virtual_grf_alloc(1);
1594 assert(reg == new_virtual_grf[i] + j - 1);
1595 (void) reg;
1596 }
1597 this->virtual_grf_sizes[i] = 1;
1598 }
1599 }
1600
1601 foreach_list(node, &this->instructions) {
1602 fs_inst *inst = (fs_inst *)node;
1603
1604 if (inst->dst.file == GRF &&
1605 split_grf[inst->dst.reg] &&
1606 inst->dst.reg_offset != 0) {
1607 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1608 inst->dst.reg_offset - 1);
1609 inst->dst.reg_offset = 0;
1610 }
1611 for (int i = 0; i < 3; i++) {
1612 if (inst->src[i].file == GRF &&
1613 split_grf[inst->src[i].reg] &&
1614 inst->src[i].reg_offset != 0) {
1615 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1616 inst->src[i].reg_offset - 1);
1617 inst->src[i].reg_offset = 0;
1618 }
1619 }
1620 }
1621 invalidate_live_intervals();
1622 }
1623
1624 /**
1625 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1626 *
1627 * During code generation, we create tons of temporary variables, many of
1628 * which get immediately killed and are never used again. Yet, in later
1629 * optimization and analysis passes, such as compute_live_intervals, we need
1630 * to loop over all the virtual GRFs. Compacting them can save a lot of
1631 * overhead.
1632 */
1633 void
1634 fs_visitor::compact_virtual_grfs()
1635 {
1636 /* Mark which virtual GRFs are used, and count how many. */
1637 int remap_table[this->virtual_grf_count];
1638 memset(remap_table, -1, sizeof(remap_table));
1639
1640 foreach_list(node, &this->instructions) {
1641 const fs_inst *inst = (const fs_inst *) node;
1642
1643 if (inst->dst.file == GRF)
1644 remap_table[inst->dst.reg] = 0;
1645
1646 for (int i = 0; i < 3; i++) {
1647 if (inst->src[i].file == GRF)
1648 remap_table[inst->src[i].reg] = 0;
1649 }
1650 }
1651
1652 /* In addition to registers used in instructions, fs_visitor keeps
1653 * direct references to certain special values which must be patched:
1654 */
1655 fs_reg *special[] = {
1656 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1657 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1658 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1659 &delta_x[0], &delta_x[1], &delta_x[2],
1660 &delta_x[3], &delta_x[4], &delta_x[5],
1661 &delta_y[0], &delta_y[1], &delta_y[2],
1662 &delta_y[3], &delta_y[4], &delta_y[5],
1663 };
1664 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1665 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1666
1667 /* Treat all special values as used, to be conservative */
1668 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1669 if (special[i]->file == GRF)
1670 remap_table[special[i]->reg] = 0;
1671 }
1672
1673 /* Compact the GRF arrays. */
1674 int new_index = 0;
1675 for (int i = 0; i < this->virtual_grf_count; i++) {
1676 if (remap_table[i] != -1) {
1677 remap_table[i] = new_index;
1678 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1679 invalidate_live_intervals();
1680 ++new_index;
1681 }
1682 }
1683
1684 this->virtual_grf_count = new_index;
1685
1686 /* Patch all the instructions to use the newly renumbered registers */
1687 foreach_list(node, &this->instructions) {
1688 fs_inst *inst = (fs_inst *) node;
1689
1690 if (inst->dst.file == GRF)
1691 inst->dst.reg = remap_table[inst->dst.reg];
1692
1693 for (int i = 0; i < 3; i++) {
1694 if (inst->src[i].file == GRF)
1695 inst->src[i].reg = remap_table[inst->src[i].reg];
1696 }
1697 }
1698
1699 /* Patch all the references to special values */
1700 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1701 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1702 special[i]->reg = remap_table[special[i]->reg];
1703 }
1704 }
1705
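/**
 * Drop push constant (UNIFORM) params that no instruction reads anymore.
 *
 * The 8-wide compile builds params_remap by scanning UNIFORM sources and
 * compacts c->prog_data.param accordingly; the 16-wide compile reuses the
 * same remap table.  All UNIFORM sources are then renumbered to the
 * compacted locations.
 */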
1706 bool
1707 fs_visitor::remove_dead_constants()
1708 {
1709 if (dispatch_width == 8) {
1710 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1711 this->nr_params_remap = c->prog_data.nr_params;
1712
1713 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1714 this->params_remap[i] = -1;
1715
1716 /* Find which params are still in use. */
1717 foreach_list(node, &this->instructions) {
1718 fs_inst *inst = (fs_inst *)node;
1719
1720 for (int i = 0; i < 3; i++) {
1721 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1722
1723 if (inst->src[i].file != UNIFORM)
1724 continue;
1725
1726 /* Section 5.11 of the OpenGL 4.3 spec says:
1727 *
1728 * "Out-of-bounds reads return undefined values, which include
1729 * values from other variables of the active program or zero."
1730 */
1731 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1732 constant_nr = 0;
1733 }
1734
1735 /* For now, set this to non-negative. We'll give it the
1736 * actual new number in a moment, in order to keep the
1737 * register numbers nicely ordered.
1738 */
1739 this->params_remap[constant_nr] = 0;
1740 }
1741 }
1742
1743 /* Figure out what the new numbers for the params will be. At some
1744 * point when we're doing uniform array access, we're going to want
1745 * to keep the distinction between .reg and .reg_offset, but for
1746 * now we don't care.
1747 */
1748 unsigned int new_nr_params = 0;
1749 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1750 if (this->params_remap[i] != -1) {
1751 this->params_remap[i] = new_nr_params++;
1752 }
1753 }
1754
1755 /* Update the list of params to be uploaded to match our new numbering. */
1756 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1757 int remapped = this->params_remap[i];
1758
1759 if (remapped == -1)
1760 continue;
1761
1762 c->prog_data.param[remapped] = c->prog_data.param[i];
1763 }
1764
1765 c->prog_data.nr_params = new_nr_params;
1766 } else {
1767 /* This should have been generated in the 8-wide pass already. */
1768 assert(this->params_remap);
1769 }
1770
1771 /* Now do the renumbering of the shader to remove unused params. */
1772 foreach_list(node, &this->instructions) {
1773 fs_inst *inst = (fs_inst *)node;
1774
1775 for (int i = 0; i < 3; i++) {
1776 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1777
1778 if (inst->src[i].file != UNIFORM)
1779 continue;
1780
1781 /* as above alias to 0 */
1782 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1783 constant_nr = 0;
1784 }
1785 assert(this->params_remap[constant_nr] != -1);
1786 inst->src[i].reg = this->params_remap[constant_nr];
1787 inst->src[i].reg_offset = 0;
1788 }
1789 }
1790
1791 return true;
1792 }
1793
1794 /*
1795 * Implements array access of uniforms by inserting a
1796 * PULL_CONSTANT_LOAD instruction.
1797 *
1798 * Unlike temporary GRF array access (where we don't support it due to
1799 * the difficulty of doing relative addressing on instruction
1800 * destinations), we could potentially do array access of uniforms
1801 * that were loaded in GRF space as push constants. In real-world
1802 * usage we've seen, though, the arrays being used are always larger
1803 * than we could load as push constants, so just always move all
1804 * uniform array access out to a pull constant buffer.
1805 */
1806 void
1807 fs_visitor::move_uniform_array_access_to_pull_constants()
1808 {
1809 int pull_constant_loc[c->prog_data.nr_params];
1810
1811 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1812 pull_constant_loc[i] = -1;
1813 }
1814
1815 /* Walk through and find array access of uniforms. Put a copy of that
1816 * uniform in the pull constant buffer.
1817 *
1818 * Note that we don't move constant-indexed accesses to arrays. No
1819 * testing has been done of the performance impact of this choice.
1820 */
1821 foreach_list_safe(node, &this->instructions) {
1822 fs_inst *inst = (fs_inst *)node;
1823
1824 for (int i = 0 ; i < 3; i++) {
1825 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1826 continue;
1827
1828 int uniform = inst->src[i].reg;
1829
1830 /* If this array isn't already present in the pull constant buffer,
1831 * add it.
1832 */
1833 if (pull_constant_loc[uniform] == -1) {
1834 const float **values = &c->prog_data.param[uniform];
1835
1836 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1837
1838 assert(param_size[uniform]);
1839
1840 for (int j = 0; j < param_size[uniform]; j++) {
1841 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1842 values[j];
1843 }
1844 }
1845
1846 /* Set up the annotation tracking for new generated instructions. */
1847 base_ir = inst->ir;
1848 current_annotation = inst->annotation;
1849
1850 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1851 fs_reg temp = fs_reg(this, glsl_type::float_type);
1852 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1853 surf_index,
1854 *inst->src[i].reladdr,
1855 pull_constant_loc[uniform] +
1856 inst->src[i].reg_offset);
1857 inst->insert_before(&list);
1858
1859 inst->src[i].file = temp.file;
1860 inst->src[i].reg = temp.reg;
1861 inst->src[i].reg_offset = temp.reg_offset;
1862 inst->src[i].reladdr = NULL;
1863 }
1864 }
1865 }
1866
1867 /**
1868 * Choose accesses from the UNIFORM file to demote to using the pull
1869 * constant buffer.
1870 *
1871 * We allow a fragment shader to have more than the specified minimum
1872 * maximum number of fragment shader uniform components (64). If
1873 * there are too many of these, they'd fill up all of register space.
1874 * So, this will push some of them out to the pull constant buffer and
1875 * update the program to load them.
1876 */
1877 void
1878 fs_visitor::setup_pull_constants()
1879 {
1880 /* Only allow 16 registers (128 uniform components) as push constants. */
1881 unsigned int max_uniform_components = 16 * 8;
1882 if (c->prog_data.nr_params <= max_uniform_components)
1883 return;
1884
1885 if (dispatch_width == 16) {
1886 fail("Pull constants not supported in 16-wide\n");
1887 return;
1888 }
1889
1890 /* Just demote the end of the list. We could probably do better
1891 * here, demoting things that are rarely used in the program first.
1892 */
1893 unsigned int pull_uniform_base = max_uniform_components;
1894
1895 int pull_constant_loc[c->prog_data.nr_params];
1896 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1897 if (i < pull_uniform_base) {
1898 pull_constant_loc[i] = -1;
1899 } else {
1900 pull_constant_loc[i] = -1;
1901 /* If our constant is already being uploaded for reladdr purposes,
1902 * reuse it.
1903 */
1904 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1905 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1906 pull_constant_loc[i] = j;
1907 break;
1908 }
1909 }
1910 if (pull_constant_loc[i] == -1) {
1911 int pull_index = c->prog_data.nr_pull_params++;
1912 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1913             pull_constant_loc[i] = pull_index;
1914 }
1915 }
1916 }
1917 c->prog_data.nr_params = pull_uniform_base;
1918
1919 foreach_list(node, &this->instructions) {
1920 fs_inst *inst = (fs_inst *)node;
1921
1922 for (int i = 0; i < 3; i++) {
1923 if (inst->src[i].file != UNIFORM)
1924 continue;
1925
1926 int pull_index = pull_constant_loc[inst->src[i].reg +
1927 inst->src[i].reg_offset];
1928 if (pull_index == -1)
1929 continue;
1930
1931 assert(!inst->src[i].reladdr);
1932
1933 fs_reg dst = fs_reg(this, glsl_type::float_type);
1934 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1935 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1936 fs_inst *pull =
1937 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1938 dst, index, offset);
1939 pull->ir = inst->ir;
1940 pull->annotation = inst->annotation;
1941
1942 inst->insert_before(pull);
1943
1944 inst->src[i].file = GRF;
1945 inst->src[i].reg = dst.reg;
1946 inst->src[i].reg_offset = 0;
1947 inst->src[i].smear = pull_index & 3;
1948 }
1949 }
1950 }
1951
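/**
 * Applies simple algebraic simplifications to the IR: a * 1.0 -> a,
 * a * 0.0 -> 0.0, a + 0.0 -> a, a | a -> a, and saturated SELs whose
 * immediate operand is made redundant by the saturate become MOVs.
 */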
1952 bool
1953 fs_visitor::opt_algebraic()
1954 {
1955 bool progress = false;
1956
1957 foreach_list(node, &this->instructions) {
1958 fs_inst *inst = (fs_inst *)node;
1959
1960 switch (inst->opcode) {
1961 case BRW_OPCODE_MUL:
1962 if (inst->src[1].file != IMM)
1963 continue;
1964
1965 /* a * 1.0 = a */
1966 if (inst->src[1].is_one()) {
1967 inst->opcode = BRW_OPCODE_MOV;
1968 inst->src[1] = reg_undef;
1969 progress = true;
1970 break;
1971 }
1972
1973 /* a * 0.0 = 0.0 */
1974 if (inst->src[1].is_zero()) {
1975 inst->opcode = BRW_OPCODE_MOV;
1976 inst->src[0] = inst->src[1];
1977 inst->src[1] = reg_undef;
1978 progress = true;
1979 break;
1980 }
1981
1982 break;
1983 case BRW_OPCODE_ADD:
1984 if (inst->src[1].file != IMM)
1985 continue;
1986
1987 /* a + 0.0 = a */
1988 if (inst->src[1].is_zero()) {
1989 inst->opcode = BRW_OPCODE_MOV;
1990 inst->src[1] = reg_undef;
1991 progress = true;
1992 break;
1993 }
1994 break;
1995 case BRW_OPCODE_OR:
1996 if (inst->src[0].equals(inst->src[1])) {
1997 inst->opcode = BRW_OPCODE_MOV;
1998 inst->src[1] = reg_undef;
1999 progress = true;
2000 break;
2001 }
2002 break;
2003 case BRW_OPCODE_SEL:
2004 if (inst->saturate && inst->src[1].file == IMM) {
2005 switch (inst->conditional_mod) {
2006 case BRW_CONDITIONAL_LE:
2007 case BRW_CONDITIONAL_L:
2008 switch (inst->src[1].type) {
2009 case BRW_REGISTER_TYPE_F:
2010 if (inst->src[1].imm.f >= 1.0f) {
2011 inst->opcode = BRW_OPCODE_MOV;
2012 inst->src[1] = reg_undef;
2013 progress = true;
2014 }
2015 break;
2016 default:
2017 break;
2018 }
2019 break;
2020 case BRW_CONDITIONAL_GE:
2021 case BRW_CONDITIONAL_G:
2022 switch (inst->src[1].type) {
2023 case BRW_REGISTER_TYPE_F:
2024 if (inst->src[1].imm.f <= 0.0f) {
2025 inst->opcode = BRW_OPCODE_MOV;
2026 inst->src[1] = reg_undef;
2027 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2028 progress = true;
2029 }
2030 break;
2031 default:
2032 break;
2033 }
2034 default:
2035 break;
2036 }
2037 }
2038 break;
2039 default:
2040 break;
2041 }
2042 }
2043
2044 return progress;
2045 }
2046
2047 /**
2048 * Removes any instructions writing a VGRF where that VGRF is not used by any
2049 * later instruction.
2050 */
2051 bool
2052 fs_visitor::dead_code_eliminate()
2053 {
2054 bool progress = false;
2055 int pc = 0;
2056
2057 calculate_live_intervals();
2058
2059 foreach_list_safe(node, &this->instructions) {
2060 fs_inst *inst = (fs_inst *)node;
2061
2062 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2063 bool dead = true;
2064
2065 for (int i = 0; i < inst->regs_written; i++) {
2066 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2067 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2068 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2069 dead = false;
2070 break;
2071 }
2072 }
2073
2074 if (dead) {
2075 /* Don't dead code eliminate instructions that write to the
2076 * accumulator as a side-effect. Instead just set the destination
2077 * to the null register to free it.
2078 */
2079 switch (inst->opcode) {
2080 case BRW_OPCODE_ADDC:
2081 case BRW_OPCODE_SUBB:
2082 case BRW_OPCODE_MACH:
2083 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2084 break;
2085 default:
2086 inst->remove();
2087 progress = true;
2088 break;
2089 }
2090 }
2091 }
2092
2093 pc++;
2094 }
2095
2096 if (progress)
2097 invalidate_live_intervals();
2098
2099 return progress;
2100 }
2101
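/* Helpers for the local dead code elimination pass below: the hash table
 * maps a (vgrf, reg_offset) pair to the last instruction that wrote it
 * without a subsequent read.
 */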
2102 struct dead_code_hash_key
2103 {
2104 int vgrf;
2105 int reg_offset;
2106 };
2107
2108 static bool
2109 dead_code_hash_compare(const void *a, const void *b)
2110 {
2111 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2112 }
2113
2114 static void
2115 clear_dead_code_hash(struct hash_table *ht)
2116 {
2117 struct hash_entry *entry;
2118
2119 hash_table_foreach(ht, entry) {
2120 _mesa_hash_table_remove(ht, entry);
2121 }
2122 }
2123
2124 static void
2125 insert_dead_code_hash(struct hash_table *ht,
2126 int vgrf, int reg_offset, fs_inst *inst)
2127 {
2128 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2129 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2130
2131 key->vgrf = vgrf;
2132 key->reg_offset = reg_offset;
2133
2134 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2135 }
2136
2137 static struct hash_entry *
2138 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2139 {
2140 struct dead_code_hash_key key;
2141
2142 key.vgrf = vgrf;
2143 key.reg_offset = reg_offset;
2144
2145 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2146 }
2147
2148 static void
2149 remove_dead_code_hash(struct hash_table *ht,
2150 int vgrf, int reg_offset)
2151 {
2152 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2153 if (!entry)
2154 return;
2155
2156 _mesa_hash_table_remove(ht, entry);
2157 }
2158
2159 /**
2160 * Walks basic blocks, removing any regs that are written but not read before
2161 * being redefined.
2162 *
2163 * The dead_code_eliminate() function implements a global dead code
2164  * elimination, but it only handles removing the last write to a register
2165 * if it's never read. This one can handle intermediate writes, but only
2166 * within a basic block.
2167 */
2168 bool
2169 fs_visitor::dead_code_eliminate_local()
2170 {
2171 struct hash_table *ht;
2172 bool progress = false;
2173
2174 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2175
2176 foreach_list_safe(node, &this->instructions) {
2177 fs_inst *inst = (fs_inst *)node;
2178
2179       /* At a basic block boundary, empty the HT since we don't track
2180        * dataflow across control flow here.
2181 */
2182 if (inst->is_control_flow()) {
2183 clear_dead_code_hash(ht);
2184 continue;
2185 }
2186
2187 /* Clear the HT of any instructions that got read. */
2188 for (int i = 0; i < 3; i++) {
2189 fs_reg src = inst->src[i];
2190 if (src.file != GRF)
2191 continue;
2192
2193 int read = 1;
2194 if (inst->is_send_from_grf())
2195 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2196
2197 for (int reg_offset = src.reg_offset;
2198 reg_offset < src.reg_offset + read;
2199 reg_offset++) {
2200 remove_dead_code_hash(ht, src.reg, reg_offset);
2201 }
2202 }
2203
2204 /* Add any update of a GRF to the HT, removing a previous write if it
2205 * wasn't read.
2206 */
2207 if (inst->dst.file == GRF) {
2208 if (inst->regs_written > 1) {
2209 /* We don't know how to trim channels from an instruction's
2210 * writes, so we can't incrementally remove unread channels from
2211              * it.  Just remove whatever it overwrites from the table.
2212 */
2213 for (int i = 0; i < inst->regs_written; i++) {
2214 remove_dead_code_hash(ht,
2215 inst->dst.reg,
2216 inst->dst.reg_offset + i);
2217 }
2218 } else {
2219 struct hash_entry *entry =
2220 get_dead_code_hash_entry(ht, inst->dst.reg,
2221 inst->dst.reg_offset);
2222
2223 if (entry) {
2224 if (inst->is_partial_write()) {
2225 /* For a partial write, we can't remove any previous dead code
2226 * candidate, since we're just modifying their result.
2227 */
2228 } else {
2229 /* We're completely updating a channel, and there was a
2230 * previous write to the channel that wasn't read. Kill it!
2231 */
2232 fs_inst *inst = (fs_inst *)entry->data;
2233 inst->remove();
2234 progress = true;
2235 }
2236
2237 _mesa_hash_table_remove(ht, entry);
2238 }
2239
2240 if (!inst->has_side_effects())
2241 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2242 inst);
2243 }
2244 }
2245 }
2246
2247 _mesa_hash_table_destroy(ht, NULL);
2248
2249 if (progress)
2250 invalidate_live_intervals();
2251
2252 return progress;
2253 }
2254
2255 /**
2256 * Implements register coalescing: Checks if the two registers involved in a
2257 * raw move don't interfere, in which case they can both be stored in the same
2258 * place and the MOV removed.
2259 */
2260 bool
2261 fs_visitor::register_coalesce()
2262 {
2263 bool progress = false;
2264
2265 calculate_live_intervals();
2266
2267 foreach_list_safe(node, &this->instructions) {
2268 fs_inst *inst = (fs_inst *)node;
2269
2270 if (inst->opcode != BRW_OPCODE_MOV ||
2271 inst->is_partial_write() ||
2272 inst->saturate ||
2273 inst->src[0].file != GRF ||
2274 inst->src[0].negate ||
2275 inst->src[0].abs ||
2276 inst->src[0].smear != -1 ||
2277 inst->dst.file != GRF ||
2278 inst->dst.type != inst->src[0].type ||
2279 virtual_grf_sizes[inst->src[0].reg] != 1) {
2280 continue;
2281 }
2282
2283 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2284 int var_to = live_intervals->var_from_reg(&inst->dst);
2285
2286 if (live_intervals->vars_interfere(var_from, var_to) &&
2287 !inst->dst.equals(inst->src[0]))
2288 continue;
2289
2290 int reg_from = inst->src[0].reg;
2291 assert(inst->src[0].reg_offset == 0);
2292 int reg_to = inst->dst.reg;
2293 int reg_to_offset = inst->dst.reg_offset;
2294
2295 foreach_list(node, &this->instructions) {
2296 fs_inst *scan_inst = (fs_inst *)node;
2297
2298 if (scan_inst->dst.file == GRF &&
2299 scan_inst->dst.reg == reg_from) {
2300 scan_inst->dst.reg = reg_to;
2301 scan_inst->dst.reg_offset = reg_to_offset;
2302 }
2303 for (int i = 0; i < 3; i++) {
2304 if (scan_inst->src[i].file == GRF &&
2305 scan_inst->src[i].reg == reg_from) {
2306 scan_inst->src[i].reg = reg_to;
2307 scan_inst->src[i].reg_offset = reg_to_offset;
2308 }
2309 }
2310 }
2311
2312 inst->remove();
2313 progress = true;
2314 continue;
2315 }
2316
2317 if (progress)
2318 invalidate_live_intervals();
2319
2320 return progress;
2321 }
2322
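/**
 * Looks for MOVs from a GRF to an MRF and tries to rewrite the instruction
 * that produced the GRF value so it writes directly into the MRF instead,
 * eliminating the MOV.
 */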
2323 bool
2324 fs_visitor::compute_to_mrf()
2325 {
2326 bool progress = false;
2327 int next_ip = 0;
2328
2329 calculate_live_intervals();
2330
2331 foreach_list_safe(node, &this->instructions) {
2332 fs_inst *inst = (fs_inst *)node;
2333
2334 int ip = next_ip;
2335 next_ip++;
2336
2337 if (inst->opcode != BRW_OPCODE_MOV ||
2338 inst->is_partial_write() ||
2339 inst->dst.file != MRF || inst->src[0].file != GRF ||
2340 inst->dst.type != inst->src[0].type ||
2341 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2342 continue;
2343
2344 /* Work out which hardware MRF registers are written by this
2345 * instruction.
2346 */
2347 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2348 int mrf_high;
2349 if (inst->dst.reg & BRW_MRF_COMPR4) {
2350 mrf_high = mrf_low + 4;
2351 } else if (dispatch_width == 16 &&
2352 (!inst->force_uncompressed && !inst->force_sechalf)) {
2353 mrf_high = mrf_low + 1;
2354 } else {
2355 mrf_high = mrf_low;
2356 }
2357
2358 /* Can't compute-to-MRF this GRF if someone else was going to
2359 * read it later.
2360 */
2361 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2362 continue;
2363
2364 /* Found a move of a GRF to a MRF. Let's see if we can go
2365 * rewrite the thing that made this GRF to write into the MRF.
2366 */
2367 fs_inst *scan_inst;
2368 for (scan_inst = (fs_inst *)inst->prev;
2369 scan_inst->prev != NULL;
2370 scan_inst = (fs_inst *)scan_inst->prev) {
2371 if (scan_inst->dst.file == GRF &&
2372 scan_inst->dst.reg == inst->src[0].reg) {
2373 /* Found the last thing to write our reg we want to turn
2374 * into a compute-to-MRF.
2375 */
2376
2377 /* If this one instruction didn't populate all the
2378 * channels, bail. We might be able to rewrite everything
2379 * that writes that reg, but it would require smarter
2380 * tracking to delay the rewriting until complete success.
2381 */
2382 if (scan_inst->is_partial_write())
2383 break;
2384
2385 /* Things returning more than one register would need us to
2386 * understand coalescing out more than one MOV at a time.
2387 */
2388 if (scan_inst->regs_written > 1)
2389 break;
2390
2391 /* SEND instructions can't have MRF as a destination. */
2392 if (scan_inst->mlen)
2393 break;
2394
2395 if (brw->gen == 6) {
2396 /* gen6 math instructions must have the destination be
2397 * GRF, so no compute-to-MRF for them.
2398 */
2399 if (scan_inst->is_math()) {
2400 break;
2401 }
2402 }
2403
2404 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2405 /* Found the creator of our MRF's source value. */
2406 scan_inst->dst.file = MRF;
2407 scan_inst->dst.reg = inst->dst.reg;
2408 scan_inst->saturate |= inst->saturate;
2409 inst->remove();
2410 progress = true;
2411 }
2412 break;
2413 }
2414
2415 /* We don't handle control flow here. Most computation of
2416           * values that end up in MRFs happens shortly before the MRF
2417 * write anyway.
2418 */
2419 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2420 break;
2421
2422 /* You can't read from an MRF, so if someone else reads our
2423 * MRF's source GRF that we wanted to rewrite, that stops us.
2424 */
2425 bool interfered = false;
2426 for (int i = 0; i < 3; i++) {
2427 if (scan_inst->src[i].file == GRF &&
2428 scan_inst->src[i].reg == inst->src[0].reg &&
2429 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2430 interfered = true;
2431 }
2432 }
2433 if (interfered)
2434 break;
2435
2436 if (scan_inst->dst.file == MRF) {
2437 /* If somebody else writes our MRF here, we can't
2438 * compute-to-MRF before that.
2439 */
2440 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2441 int scan_mrf_high;
2442
2443 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2444 scan_mrf_high = scan_mrf_low + 4;
2445 } else if (dispatch_width == 16 &&
2446 (!scan_inst->force_uncompressed &&
2447 !scan_inst->force_sechalf)) {
2448 scan_mrf_high = scan_mrf_low + 1;
2449 } else {
2450 scan_mrf_high = scan_mrf_low;
2451 }
2452
2453 if (mrf_low == scan_mrf_low ||
2454 mrf_low == scan_mrf_high ||
2455 mrf_high == scan_mrf_low ||
2456 mrf_high == scan_mrf_high) {
2457 break;
2458 }
2459 }
2460
2461 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2462 /* Found a SEND instruction, which means that there are
2463 * live values in MRFs from base_mrf to base_mrf +
2464 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2465 * above it.
2466 */
2467 if (mrf_low >= scan_inst->base_mrf &&
2468 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2469 break;
2470 }
2471 if (mrf_high >= scan_inst->base_mrf &&
2472 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2473 break;
2474 }
2475 }
2476 }
2477 }
2478
2479 if (progress)
2480 invalidate_live_intervals();
2481
2482 return progress;
2483 }
2484
2485 /**
2486 * Walks through basic blocks, looking for repeated MRF writes and
2487 * removing the later ones.
2488 */
2489 bool
2490 fs_visitor::remove_duplicate_mrf_writes()
2491 {
2492 fs_inst *last_mrf_move[16];
2493 bool progress = false;
2494
2495    /* We'd need to update the MRF tracking for compressed instructions. */
2496 if (dispatch_width == 16)
2497 return false;
2498
2499 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2500
2501 foreach_list_safe(node, &this->instructions) {
2502 fs_inst *inst = (fs_inst *)node;
2503
2504 if (inst->is_control_flow()) {
2505 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2506 }
2507
2508 if (inst->opcode == BRW_OPCODE_MOV &&
2509 inst->dst.file == MRF) {
2510 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2511 if (prev_inst && inst->equals(prev_inst)) {
2512 inst->remove();
2513 progress = true;
2514 continue;
2515 }
2516 }
2517
2518 /* Clear out the last-write records for MRFs that were overwritten. */
2519 if (inst->dst.file == MRF) {
2520 last_mrf_move[inst->dst.reg] = NULL;
2521 }
2522
2523 if (inst->mlen > 0 && inst->base_mrf != -1) {
2524 /* Found a SEND instruction, which will include two or fewer
2525 * implied MRF writes. We could do better here.
2526 */
2527 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2528 last_mrf_move[inst->base_mrf + i] = NULL;
2529 }
2530 }
2531
2532 /* Clear out any MRF move records whose sources got overwritten. */
2533 if (inst->dst.file == GRF) {
2534 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2535 if (last_mrf_move[i] &&
2536 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2537 last_mrf_move[i] = NULL;
2538 }
2539 }
2540 }
2541
2542 if (inst->opcode == BRW_OPCODE_MOV &&
2543 inst->dst.file == MRF &&
2544 inst->src[0].file == GRF &&
2545 !inst->is_partial_write()) {
2546 last_mrf_move[inst->dst.reg] = inst;
2547 }
2548 }
2549
2550 if (progress)
2551 invalidate_live_intervals();
2552
2553 return progress;
2554 }
2555
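/**
 * Clears the needs_dep flags for any registers in the tracked range
 * [first_grf, first_grf + grf_len) that inst reads, since a read resolves
 * the outstanding dependency for the gen4 SEND workarounds below.
 */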
2556 static void
2557 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2558 int first_grf, int grf_len)
2559 {
2560 bool inst_16wide = (dispatch_width > 8 &&
2561 !inst->force_uncompressed &&
2562 !inst->force_sechalf);
2563
2564 /* Clear the flag for registers that actually got read (as expected). */
2565 for (int i = 0; i < 3; i++) {
2566 int grf;
2567 if (inst->src[i].file == GRF) {
2568 grf = inst->src[i].reg;
2569 } else if (inst->src[i].file == HW_REG &&
2570 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2571 grf = inst->src[i].fixed_hw_reg.nr;
2572 } else {
2573 continue;
2574 }
2575
2576 if (grf >= first_grf &&
2577 grf < first_grf + grf_len) {
2578 deps[grf - first_grf] = false;
2579 if (inst_16wide)
2580 deps[grf - first_grf + 1] = false;
2581 }
2582 }
2583 }
2584
2585 /**
2586 * Implements this workaround for the original 965:
2587 *
2588 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2589 * check for post destination dependencies on this instruction, software
2590 * must ensure that there is no destination hazard for the case of ‘write
2591 * followed by a posted write’ shown in the following example.
2592 *
2593 * 1. mov r3 0
2594 * 2. send r3.xy <rest of send instruction>
2595 * 3. mov r2 r3
2596 *
2597 * Due to no post-destination dependency check on the ‘send’, the above
2598 * code sequence could have two instructions (1 and 2) in flight at the
2599 * same time that both consider ‘r3’ as the target of their final writes.
2600 */
2601 void
2602 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2603 {
2604 int reg_size = dispatch_width / 8;
2605 int write_len = inst->regs_written * reg_size;
2606 int first_write_grf = inst->dst.reg;
2607 bool needs_dep[BRW_MAX_MRF];
2608 assert(write_len < (int)sizeof(needs_dep) - 1);
2609
2610 memset(needs_dep, false, sizeof(needs_dep));
2611 memset(needs_dep, true, write_len);
2612
2613 clear_deps_for_inst_src(inst, dispatch_width,
2614 needs_dep, first_write_grf, write_len);
2615
2616 /* Walk backwards looking for writes to registers we're writing which
2617 * aren't read since being written. If we hit the start of the program,
2618 * we assume that there are no outstanding dependencies on entry to the
2619 * program.
2620 */
2621 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2622 scan_inst != NULL;
2623 scan_inst = (fs_inst *)scan_inst->prev) {
2624
2625 /* If we hit control flow, assume that there *are* outstanding
2626 * dependencies, and force their cleanup before our instruction.
2627 */
2628 if (scan_inst->is_control_flow()) {
2629 for (int i = 0; i < write_len; i++) {
2630 if (needs_dep[i]) {
2631 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2632 }
2633 }
2634 return;
2635 }
2636
2637 bool scan_inst_16wide = (dispatch_width > 8 &&
2638 !scan_inst->force_uncompressed &&
2639 !scan_inst->force_sechalf);
2640
2641 /* We insert our reads as late as possible on the assumption that any
2642 * instruction but a MOV that might have left us an outstanding
2643 * dependency has more latency than a MOV.
2644 */
2645 if (scan_inst->dst.file == GRF) {
2646 for (int i = 0; i < scan_inst->regs_written; i++) {
2647 int reg = scan_inst->dst.reg + i * reg_size;
2648
2649 if (reg >= first_write_grf &&
2650 reg < first_write_grf + write_len &&
2651 needs_dep[reg - first_write_grf]) {
2652 inst->insert_before(DEP_RESOLVE_MOV(reg));
2653 needs_dep[reg - first_write_grf] = false;
2654 if (scan_inst_16wide)
2655 needs_dep[reg - first_write_grf + 1] = false;
2656 }
2657 }
2658 }
2659
2660 /* Clear the flag for registers that actually got read (as expected). */
2661 clear_deps_for_inst_src(scan_inst, dispatch_width,
2662 needs_dep, first_write_grf, write_len);
2663
2664 /* Continue the loop only if we haven't resolved all the dependencies */
2665 int i;
2666 for (i = 0; i < write_len; i++) {
2667 if (needs_dep[i])
2668 break;
2669 }
2670 if (i == write_len)
2671 return;
2672 }
2673 }
2674
2675 /**
2676 * Implements this workaround for the original 965:
2677 *
2678 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2679 * used as a destination register until after it has been sourced by an
2680 * instruction with a different destination register.
2681 */
2682 void
2683 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2684 {
2685 int write_len = inst->regs_written * dispatch_width / 8;
2686 int first_write_grf = inst->dst.reg;
2687 bool needs_dep[BRW_MAX_MRF];
2688 assert(write_len < (int)sizeof(needs_dep) - 1);
2689
2690 memset(needs_dep, false, sizeof(needs_dep));
2691 memset(needs_dep, true, write_len);
2692 /* Walk forwards looking for writes to registers we're writing which aren't
2693 * read before being written.
2694 */
2695 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2696 !scan_inst->is_tail_sentinel();
2697 scan_inst = (fs_inst *)scan_inst->next) {
2698 /* If we hit control flow, force resolve all remaining dependencies. */
2699 if (scan_inst->is_control_flow()) {
2700 for (int i = 0; i < write_len; i++) {
2701 if (needs_dep[i])
2702 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2703 }
2704 return;
2705 }
2706
2707 /* Clear the flag for registers that actually got read (as expected). */
2708 clear_deps_for_inst_src(scan_inst, dispatch_width,
2709 needs_dep, first_write_grf, write_len);
2710
2711 /* We insert our reads as late as possible since they're reading the
2712 * result of a SEND, which has massive latency.
2713 */
2714 if (scan_inst->dst.file == GRF &&
2715 scan_inst->dst.reg >= first_write_grf &&
2716 scan_inst->dst.reg < first_write_grf + write_len &&
2717 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2718 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2719 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2720 }
2721
2722 /* Continue the loop only if we haven't resolved all the dependencies */
2723 int i;
2724 for (i = 0; i < write_len; i++) {
2725 if (needs_dep[i])
2726 break;
2727 }
2728 if (i == write_len)
2729 return;
2730 }
2731
2732 /* If we hit the end of the program, resolve all remaining dependencies out
2733 * of paranoia.
2734 */
2735 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2736 assert(last_inst->eot);
2737 for (int i = 0; i < write_len; i++) {
2738 if (needs_dep[i])
2739 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2740 }
2741 }
2742
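/**
 * Entry point for the gen4 SEND dependency workarounds: finds SEND-like
 * instructions (mlen != 0) that write a GRF and applies the pre- and
 * post-send fixups above around each of them.
 */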
2743 void
2744 fs_visitor::insert_gen4_send_dependency_workarounds()
2745 {
2746 if (brw->gen != 4 || brw->is_g4x)
2747 return;
2748
2749 /* Note that we're done with register allocation, so GRF fs_regs always
2750 * have a .reg_offset of 0.
2751 */
2752
2753 foreach_list_safe(node, &this->instructions) {
2754 fs_inst *inst = (fs_inst *)node;
2755
2756 if (inst->mlen != 0 && inst->dst.file == GRF) {
2757 insert_gen4_pre_send_dependency_workarounds(inst);
2758 insert_gen4_post_send_dependency_workarounds(inst);
2759 }
2760 }
2761 }
2762
2763 /**
2764 * Turns the generic expression-style uniform pull constant load instruction
2765 * into a hardware-specific series of instructions for loading a pull
2766 * constant.
2767 *
2768 * The expression style allows the CSE pass before this to optimize out
2769 * repeated loads from the same offset, and gives the pre-register-allocation
2770 * scheduling full flexibility, while the conversion to native instructions
2771 * allows the post-register-allocation scheduler the best information
2772 * possible.
2773 *
2774 * Note that execution masking for setting up pull constant loads is special:
2775 * the channels that need to be written are unrelated to the current execution
2776 * mask, since a later instruction will use one of the result channels as a
2777 * source operand for all 8 or 16 of its channels.
2778 */
2779 void
2780 fs_visitor::lower_uniform_pull_constant_loads()
2781 {
2782 foreach_list(node, &this->instructions) {
2783 fs_inst *inst = (fs_inst *)node;
2784
2785 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2786 continue;
2787
2788 if (brw->gen >= 7) {
2789 /* The offset arg before was a vec4-aligned byte offset. We need to
2790 * turn it into a dword offset.
2791 */
2792 fs_reg const_offset_reg = inst->src[1];
2793 assert(const_offset_reg.file == IMM &&
2794 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2795 const_offset_reg.imm.u /= 4;
2796 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2797
2798 /* This is actually going to be a MOV, but since only the first dword
2799 * is accessed, we have a special opcode to do just that one. Note
2800 * that this needs to be an operation that will be considered a def
2801 * by live variable analysis, or register allocation will explode.
2802 */
2803 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2804 payload, const_offset_reg);
2805 setup->force_writemask_all = true;
2806
2807 setup->ir = inst->ir;
2808 setup->annotation = inst->annotation;
2809 inst->insert_before(setup);
2810
2811 /* Similarly, this will only populate the first 4 channels of the
2812 * result register (since we only use smear values from 0-3), but we
2813 * don't tell the optimizer.
2814 */
2815 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2816 inst->src[1] = payload;
2817
2818 invalidate_live_intervals();
2819 } else {
2820 /* Before register allocation, we didn't tell the scheduler about the
2821 * MRF we use. We know it's safe to use this MRF because nothing
2822 * else does except for register spill/unspill, which generates and
2823 * uses its MRF within a single IR instruction.
2824 */
2825 inst->base_mrf = 14;
2826 inst->mlen = 1;
2827 }
2828 }
2829 }
2830
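/**
 * Prints a single FS IR instruction in human-readable form: predicate,
 * opcode with saturate/conditional modifiers, destination, and up to three
 * sources, plus any uncompressed/sechalf markers.
 */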
2831 void
2832 fs_visitor::dump_instruction(backend_instruction *be_inst)
2833 {
2834 fs_inst *inst = (fs_inst *)be_inst;
2835
2836 if (inst->predicate) {
2837 printf("(%cf0.%d) ",
2838 inst->predicate_inverse ? '-' : '+',
2839 inst->flag_subreg);
2840 }
2841
2842 printf("%s", brw_instruction_name(inst->opcode));
2843 if (inst->saturate)
2844 printf(".sat");
2845 if (inst->conditional_mod) {
2846 printf("%s", conditional_modifier[inst->conditional_mod]);
2847 if (!inst->predicate &&
2848 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2849 inst->opcode != BRW_OPCODE_IF &&
2850 inst->opcode != BRW_OPCODE_WHILE))) {
2851 printf(".f0.%d", inst->flag_subreg);
2852 }
2853 }
2854 printf(" ");
2855
2856
2857 switch (inst->dst.file) {
2858 case GRF:
2859 printf("vgrf%d", inst->dst.reg);
2860 if (inst->dst.reg_offset)
2861 printf("+%d", inst->dst.reg_offset);
2862 break;
2863 case MRF:
2864 printf("m%d", inst->dst.reg);
2865 break;
2866 case BAD_FILE:
2867 printf("(null)");
2868 break;
2869 case UNIFORM:
2870 printf("***u%d***", inst->dst.reg);
2871 break;
2872 case HW_REG:
2873 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2874 switch (inst->dst.fixed_hw_reg.nr) {
2875 case BRW_ARF_NULL:
2876 printf("null");
2877 break;
2878 case BRW_ARF_ADDRESS:
2879 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
2880 break;
2881 case BRW_ARF_ACCUMULATOR:
2882 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
2883 break;
2884 case BRW_ARF_FLAG:
2885 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2886 inst->dst.fixed_hw_reg.subnr);
2887 break;
2888 default:
2889 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2890 inst->dst.fixed_hw_reg.subnr);
2891 break;
2892 }
2893 } else {
2894 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2895 }
2896 if (inst->dst.fixed_hw_reg.subnr)
2897 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2898 break;
2899 default:
2900 printf("???");
2901 break;
2902 }
2903 printf(":%s, ", reg_encoding[inst->dst.type]);
2904
2905 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
2906 if (inst->src[i].negate)
2907 printf("-");
2908 if (inst->src[i].abs)
2909 printf("|");
2910 switch (inst->src[i].file) {
2911 case GRF:
2912 printf("vgrf%d", inst->src[i].reg);
2913 if (inst->src[i].reg_offset)
2914 printf("+%d", inst->src[i].reg_offset);
2915 break;
2916 case MRF:
2917 printf("***m%d***", inst->src[i].reg);
2918 break;
2919 case UNIFORM:
2920 printf("u%d", inst->src[i].reg);
2921 if (inst->src[i].reg_offset)
2922 printf(".%d", inst->src[i].reg_offset);
2923 break;
2924 case BAD_FILE:
2925 printf("(null)");
2926 break;
2927 case IMM:
2928 switch (inst->src[i].type) {
2929 case BRW_REGISTER_TYPE_F:
2930 printf("%ff", inst->src[i].imm.f);
2931 break;
2932 case BRW_REGISTER_TYPE_D:
2933 printf("%dd", inst->src[i].imm.i);
2934 break;
2935 case BRW_REGISTER_TYPE_UD:
2936 printf("%uu", inst->src[i].imm.u);
2937 break;
2938 default:
2939 printf("???");
2940 break;
2941 }
2942 break;
2943 case HW_REG:
2944 if (inst->src[i].fixed_hw_reg.negate)
2945 printf("-");
2946 if (inst->src[i].fixed_hw_reg.abs)
2947 printf("|");
2948 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2949 switch (inst->src[i].fixed_hw_reg.nr) {
2950 case BRW_ARF_NULL:
2951 printf("null");
2952 break;
2953 case BRW_ARF_ADDRESS:
2954 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
2955 break;
2956 case BRW_ARF_ACCUMULATOR:
2957 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
2958 break;
2959 case BRW_ARF_FLAG:
2960 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2961 inst->src[i].fixed_hw_reg.subnr);
2962 break;
2963 default:
2964 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2965 inst->src[i].fixed_hw_reg.subnr);
2966 break;
2967 }
2968 } else {
2969 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2970 }
2971 if (inst->src[i].fixed_hw_reg.subnr)
2972 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2973 if (inst->src[i].fixed_hw_reg.abs)
2974 printf("|");
2975 break;
2976 default:
2977 printf("???");
2978 break;
2979 }
2980 if (inst->src[i].abs)
2981 printf("|");
2982
2983 if (inst->src[i].file != IMM) {
2984 printf(":%s", reg_encoding[inst->src[i].type]);
2985 }
2986
2987 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
2988 printf(", ");
2989 }
2990
2991 printf(" ");
2992
2993 if (inst->force_uncompressed)
2994 printf("1sthalf ");
2995
2996 if (inst->force_sechalf)
2997 printf("2ndhalf ");
2998
2999 printf("\n");
3000 }
3001
3002 /**
3003 * Possibly returns an instruction that set up @param reg.
3004 *
3005 * Sometimes we want to take the result of some expression/variable
3006 * dereference tree and rewrite the instruction generating the result
3007 * of the tree. When processing the tree, we know that the
3008 * instructions generated are all writing temporaries that are dead
3009 * outside of this tree. So, if we have some instructions that write
3010 * a temporary, we're free to point that temp write somewhere else.
3011 *
3012  * Note that this doesn't guarantee that the returned instruction wrote
3013  * only reg -- it might be the size=4 destination of a texture instruction.
3014 */
3015 fs_inst *
3016 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3017 fs_inst *end,
3018 fs_reg reg)
3019 {
3020 if (end == start ||
3021 end->is_partial_write() ||
3022 reg.reladdr ||
3023 !reg.equals(end->dst)) {
3024 return NULL;
3025 } else {
3026 return end;
3027 }
3028 }
3029
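/**
 * Records which registers of the gen6+ thread payload hold the barycentric
 * coordinates, source depth/W, sample position offsets, and sample mask,
 * and counts the total number of payload registers.
 */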
3030 void
3031 fs_visitor::setup_payload_gen6()
3032 {
3033 bool uses_depth =
3034 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3035 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3036
3037 assert(brw->gen >= 6);
3038
3039 /* R0-1: masks, pixel X/Y coordinates. */
3040 c->nr_payload_regs = 2;
3041    /* R2: only for 32-pixel dispatch. */
3042
3043 /* R3-26: barycentric interpolation coordinates. These appear in the
3044 * same order that they appear in the brw_wm_barycentric_interp_mode
3045 * enum. Each set of coordinates occupies 2 registers if dispatch width
3046 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3047 * appear if they were enabled using the "Barycentric Interpolation
3048 * Mode" bits in WM_STATE.
3049 */
3050 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3051 if (barycentric_interp_modes & (1 << i)) {
3052 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3053 c->nr_payload_regs += 2;
3054 if (dispatch_width == 16) {
3055 c->nr_payload_regs += 2;
3056 }
3057 }
3058 }
3059
3060 /* R27: interpolated depth if uses source depth */
3061 if (uses_depth) {
3062 c->source_depth_reg = c->nr_payload_regs;
3063 c->nr_payload_regs++;
3064 if (dispatch_width == 16) {
3065 /* R28: interpolated depth if not 8-wide. */
3066 c->nr_payload_regs++;
3067 }
3068 }
3069 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3070 if (uses_depth) {
3071 c->source_w_reg = c->nr_payload_regs;
3072 c->nr_payload_regs++;
3073 if (dispatch_width == 16) {
3074 /* R30: interpolated W if not 8-wide. */
3075 c->nr_payload_regs++;
3076 }
3077 }
3078
3079 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3080 /* R31: MSAA position offsets. */
3081 if (c->prog_data.uses_pos_offset) {
3082 c->sample_pos_reg = c->nr_payload_regs;
3083 c->nr_payload_regs++;
3084 }
3085
3086 /* R32: MSAA input coverage mask */
3087 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3088 assert(brw->gen >= 7);
3089 c->sample_mask_reg = c->nr_payload_regs;
3090 c->nr_payload_regs++;
3091 if (dispatch_width == 16) {
3092 /* R33: input coverage mask if not 8-wide. */
3093 c->nr_payload_regs++;
3094 }
3095 }
3096
3097 /* R34-: bary for 32-pixel. */
3098 /* R58-59: interp W for 32-pixel. */
3099
3100 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3101 c->source_depth_to_render_target = true;
3102 }
3103 }
3104
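/**
 * Lays out the fragment shader's binding table: render targets first
 * (at least one, for the null renderbuffer case), followed by the
 * common surfaces assigned by assign_common_binding_table_offsets().
 */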
3105 void
3106 fs_visitor::assign_binding_table_offsets()
3107 {
3108 uint32_t next_binding_table_offset = 0;
3109
3110 /* If there are no color regions, we still perform an FB write to a null
3111 * renderbuffer, which we place at surface index 0.
3112 */
3113 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3114 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3115
3116 assign_common_binding_table_offsets(next_binding_table_offset);
3117 }
3118
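/**
 * Top-level driver for a single compile: sets up the payload, emits FS IR
 * from the GLSL or ARB program, runs the optimization loop, and performs
 * scheduling and register allocation, falling back to spilling only in
 * SIMD8.  Returns false if the compile failed.
 */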
3119 bool
3120 fs_visitor::run()
3121 {
3122 sanity_param_count = fp->Base.Parameters->NumParameters;
3123 uint32_t orig_nr_params = c->prog_data.nr_params;
3124 bool allocated_without_spills;
3125
3126 assign_binding_table_offsets();
3127
3128 if (brw->gen >= 6)
3129 setup_payload_gen6();
3130 else
3131 setup_payload_gen4();
3132
3133 if (0) {
3134 emit_dummy_fs();
3135 } else {
3136 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3137 emit_shader_time_begin();
3138
3139 calculate_urb_setup();
3140 if (fp->Base.InputsRead > 0) {
3141 if (brw->gen < 6)
3142 emit_interpolation_setup_gen4();
3143 else
3144 emit_interpolation_setup_gen6();
3145 }
3146
3147 /* We handle discards by keeping track of the still-live pixels in f0.1.
3148 * Initialize it with the dispatched pixels.
3149 */
3150 if (fp->UsesKill || c->key.alpha_test_func) {
3151 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3152 discard_init->flag_subreg = 1;
3153 }
3154
3155 /* Generate FS IR for main(). (the visitor only descends into
3156 * functions called "main").
3157 */
3158 if (shader) {
3159 foreach_list(node, &*shader->ir) {
3160 ir_instruction *ir = (ir_instruction *)node;
3161 base_ir = ir;
3162 this->result = reg_undef;
3163 ir->accept(this);
3164 }
3165 } else {
3166 emit_fragment_program_code();
3167 }
3168 base_ir = NULL;
3169 if (failed)
3170 return false;
3171
3172 emit(FS_OPCODE_PLACEHOLDER_HALT);
3173
3174 if (c->key.alpha_test_func)
3175 emit_alpha_test();
3176
3177 emit_fb_writes();
3178
3179 split_virtual_grfs();
3180
3181 move_uniform_array_access_to_pull_constants();
3182 remove_dead_constants();
3183 setup_pull_constants();
3184
3185 bool progress;
3186 do {
3187 progress = false;
3188
3189 compact_virtual_grfs();
3190
3191 progress = remove_duplicate_mrf_writes() || progress;
3192
3193 progress = opt_algebraic() || progress;
3194 progress = opt_cse() || progress;
3195 progress = opt_copy_propagate() || progress;
3196 progress = opt_peephole_sel() || progress;
3197 progress = opt_peephole_predicated_break() || progress;
3198 progress = dead_code_eliminate() || progress;
3199 progress = dead_code_eliminate_local() || progress;
3200 progress = dead_control_flow_eliminate(this) || progress;
3201 progress = register_coalesce() || progress;
3202 progress = compute_to_mrf() || progress;
3203 } while (progress);
3204
3205 lower_uniform_pull_constant_loads();
3206
3207 assign_curb_setup();
3208 assign_urb_setup();
3209
3210 static enum instruction_scheduler_mode pre_modes[] = {
3211 SCHEDULE_PRE,
3212 SCHEDULE_PRE_NON_LIFO,
3213 SCHEDULE_PRE_LIFO,
3214 };
3215
3216 /* Try each scheduling heuristic to see if it can successfully register
3217 * allocate without spilling. They should be ordered by decreasing
3218 * performance but increasing likelihood of allocating.
3219 */
3220 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3221 schedule_instructions(pre_modes[i]);
3222
3223 if (0) {
3224 assign_regs_trivial();
3225 allocated_without_spills = true;
3226 } else {
3227 allocated_without_spills = assign_regs(false);
3228 }
3229 if (allocated_without_spills)
3230 break;
3231 }
3232
3233 if (!allocated_without_spills) {
3234 /* We assume that any spilling is worse than just dropping back to
3235 * SIMD8. There's probably actually some intermediate point where
3236 * SIMD16 with a couple of spills is still better.
3237 */
3238 if (dispatch_width == 16) {
3239 fail("Failure to register allocate. Reduce number of "
3240 "live scalar values to avoid this.");
3241 }
3242
3243 /* Since we're out of heuristics, just go spill registers until we
3244 * get an allocation.
3245 */
3246 while (!assign_regs(true)) {
3247 if (failed)
3248 break;
3249 }
3250 }
3251 }
3252 assert(force_uncompressed_stack == 0);
3253
3254 /* This must come after all optimization and register allocation, since
3255 * it inserts dead code that happens to have side effects, and it does
3256 * so based on the actual physical registers in use.
3257 */
3258 insert_gen4_send_dependency_workarounds();
3259
3260 if (failed)
3261 return false;
3262
3263 if (!allocated_without_spills)
3264 schedule_instructions(SCHEDULE_POST);
3265
3266 if (dispatch_width == 8) {
3267 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3268 } else {
3269 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3270
3271 /* Make sure we didn't try to sneak in an extra uniform */
3272 assert(orig_nr_params == c->prog_data.nr_params);
3273 (void) orig_nr_params;
3274 }
3275
3276 /* If any state parameters were appended, then ParameterValues could have
3277 * been realloced, in which case the driver uniform storage set up by
3278 * _mesa_associate_uniform_storage() would point to freed memory. Make
3279 * sure that didn't happen.
3280 */
3281 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3282
3283 return !failed;
3284 }
3285
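/**
 * Compiles the fragment program to native code: runs an 8-wide fs_visitor,
 * optionally a 16-wide one, and then hands both instruction lists to
 * fs_generator to produce the final assembly.
 */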
3286 const unsigned *
3287 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3288 struct gl_fragment_program *fp,
3289 struct gl_shader_program *prog,
3290 unsigned *final_assembly_size)
3291 {
3292 bool start_busy = false;
3293 float start_time = 0;
3294
3295 if (unlikely(brw->perf_debug)) {
3296 start_busy = (brw->batch.last_bo &&
3297 drm_intel_bo_busy(brw->batch.last_bo));
3298 start_time = get_time();
3299 }
3300
3301 struct brw_shader *shader = NULL;
3302 if (prog)
3303 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3304
3305 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3306 if (prog) {
3307 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3308 _mesa_print_ir(shader->ir, NULL);
3309 printf("\n\n");
3310 } else {
3311 printf("ARB_fragment_program %d ir for native fragment shader\n",
3312 fp->Base.Id);
3313 _mesa_print_program(&fp->Base);
3314 }
3315 }
3316
3317 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3318 */
3319 fs_visitor v(brw, c, prog, fp, 8);
3320 if (!v.run()) {
3321 if (prog) {
3322 prog->LinkStatus = false;
3323 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3324 }
3325
3326 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3327 v.fail_msg);
3328
3329 return NULL;
3330 }
3331
3332 exec_list *simd16_instructions = NULL;
3333 fs_visitor v2(brw, c, prog, fp, 16);
3334 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3335 if (c->prog_data.nr_pull_params == 0) {
3336 /* Try a 16-wide compile */
3337 v2.import_uniforms(&v);
3338 if (!v2.run()) {
3339 perf_debug("16-wide shader failed to compile, falling back to "
3340 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3341 } else {
3342 simd16_instructions = &v2.instructions;
3343 }
3344 } else {
3345 perf_debug("Skipping 16-wide due to pull parameters.\n");
3346 }
3347 }
3348
3349 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3350 const unsigned *generated = g.generate_assembly(&v.instructions,
3351 simd16_instructions,
3352 final_assembly_size);
3353
3354 if (unlikely(brw->perf_debug) && shader) {
3355 if (shader->compiled_once)
3356 brw_wm_debug_recompile(brw, prog, &c->key);
3357 shader->compiled_once = true;
3358
3359 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3360 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3361 (get_time() - start_time) * 1000);
3362 }
3363 }
3364
3365 return generated;
3366 }
3367
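/**
 * Precompiles the fragment shader at link time using a best-guess program
 * key, then restores the previous WM program state.
 */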
3368 bool
3369 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3370 {
3371 struct brw_context *brw = brw_context(ctx);
3372 struct brw_wm_prog_key key;
3373
3374 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3375 return true;
3376
3377 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3378 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3379 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3380 bool program_uses_dfdy = fp->UsesDFdy;
3381
3382 memset(&key, 0, sizeof(key));
3383
3384 if (brw->gen < 6) {
3385 if (fp->UsesKill)
3386 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3387
3388 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3389 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3390
3391 /* Just assume depth testing. */
3392 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3393 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3394 }
3395
3396 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3397 BRW_FS_VARYING_INPUT_MASK) > 16)
3398 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3399
3400 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3401
3402 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3403 for (unsigned i = 0; i < sampler_count; i++) {
3404 if (fp->Base.ShadowSamplers & (1 << i)) {
3405 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3406 key.tex.swizzles[i] =
3407 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3408 } else {
3409 /* Color sampler: assume no swizzling. */
3410 key.tex.swizzles[i] = SWIZZLE_XYZW;
3411 }
3412 }
3413
3414 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3415 key.drawable_height = ctx->DrawBuffer->Height;
3416 }
3417
3418 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3419 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3420 }
3421
3422 key.nr_color_regions = 1;
3423
3424 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3425 * quality of the derivatives is likely to be determined by the driconf
3426 * option.
3427 */
3428 key.high_quality_derivatives = brw->disable_derivative_optimization;
3429
3430 key.program_string_id = bfp->id;
3431
3432 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3433 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3434
3435 bool success = do_wm_prog(brw, prog, bfp, &key);
3436
3437 brw->wm.base.prog_offset = old_prog_offset;
3438 brw->wm.prog_data = old_prog_data;
3439
3440 return success;
3441 }