i965: Use sample barycentric coordinates with per sample shading
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
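   /* For illustration (a hedged reading of the note above, not a statement
    * from the PRM): if dst were left D-typed on gen4, float sources such as
    * 0.3f and 0.7f would both be converted to integer 0 before the compare,
    * so a CMP.L would report "not less than" even though 0.3f < 0.7f.
    * Retyping dst to the source type below keeps the comparison in float.
    */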
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258 * We break down the const_offset to a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
264 */
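   /* For example, with const_offset = 6 the aligned part (6 & ~3 = 4) is
    * added to the variable offset here, so the load fetches the vec4 that
    * contains the value, and the remainder (6 & 3 = 2) is applied to
    * vec4_result.reg_offset below to select the right component.
    */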
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
303
304 /**
305 * A helper for MOV generation for fixing up broken hardware SEND dependency
306 * handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 this->smear = -1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 type == r.type &&
436 negate == r.negate &&
437 abs == r.abs &&
438 !reladdr && !r.reladdr &&
439 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
440 sizeof(fixed_hw_reg)) == 0 &&
441 smear == r.smear &&
442 imm.u == r.imm.u);
443 }
444
445 fs_reg
446 fs_reg::retype(uint32_t type)
447 {
448 fs_reg result = *this;
449 result.type = type;
450 return result;
451 }
452
453 bool
454 fs_reg::is_zero() const
455 {
456 if (file != IMM)
457 return false;
458
459 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
460 }
461
462 bool
463 fs_reg::is_one() const
464 {
465 if (file != IMM)
466 return false;
467
468 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
469 }
470
471 bool
472 fs_reg::is_null() const
473 {
474 return file == HW_REG &&
475 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
476 fixed_hw_reg.nr == BRW_ARF_NULL;
477 }
478
479 bool
480 fs_reg::is_valid_3src() const
481 {
482 return file == GRF || file == UNIFORM;
483 }
484
485 int
486 fs_visitor::type_size(const struct glsl_type *type)
487 {
488 unsigned int size, i;
489
490 switch (type->base_type) {
491 case GLSL_TYPE_UINT:
492 case GLSL_TYPE_INT:
493 case GLSL_TYPE_FLOAT:
494 case GLSL_TYPE_BOOL:
495 return type->components();
496 case GLSL_TYPE_ARRAY:
497 return type_size(type->fields.array) * type->length;
498 case GLSL_TYPE_STRUCT:
499 size = 0;
500 for (i = 0; i < type->length; i++) {
501 size += type_size(type->fields.structure[i].type);
502 }
503 return size;
504 case GLSL_TYPE_SAMPLER:
505 /* Samplers take up no register space, since they're baked in at
506 * link time.
507 */
508 return 0;
509 case GLSL_TYPE_ATOMIC_UINT:
510 return 0;
511 case GLSL_TYPE_VOID:
512 case GLSL_TYPE_ERROR:
513 case GLSL_TYPE_INTERFACE:
514 assert(!"not reached");
515 break;
516 }
517
518 return 0;
519 }
520
521 fs_reg
522 fs_visitor::get_timestamp()
523 {
524 assert(brw->gen >= 7);
525
526 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
527 BRW_ARF_TIMESTAMP,
528 0),
529 BRW_REGISTER_TYPE_UD));
530
531 fs_reg dst = fs_reg(this, glsl_type::uint_type);
532
533 fs_inst *mov = emit(MOV(dst, ts));
534 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
535 * even if it's not enabled in the dispatch.
536 */
537 mov->force_writemask_all = true;
538 mov->force_uncompressed = true;
539
540 /* The caller wants the low 32 bits of the timestamp. Since it's running
541 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
542 * which is plenty of time for our purposes. It is identical across the
543 * EUs, but since it's tracking GPU core speed it will increment at a
544 * varying rate as render P-states change.
545 *
546 * The caller could also check if render P-states have changed (or anything
547 * else that might disrupt timing) by setting smear to 2 and checking if
548 * that field is != 0.
549 */
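   /* Rough arithmetic behind the "~3 seconds" figure: 2^32 cycles at
    * ~1.2e9 cycles per second is about 3.6 seconds between rollovers.
    */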
550 dst.smear = 0;
551
552 return dst;
553 }
554
555 void
556 fs_visitor::emit_shader_time_begin()
557 {
558 current_annotation = "shader time start";
559 shader_start_time = get_timestamp();
560 }
561
562 void
563 fs_visitor::emit_shader_time_end()
564 {
565 current_annotation = "shader time end";
566
567 enum shader_time_shader_type type, written_type, reset_type;
568 if (dispatch_width == 8) {
569 type = ST_FS8;
570 written_type = ST_FS8_WRITTEN;
571 reset_type = ST_FS8_RESET;
572 } else {
573 assert(dispatch_width == 16);
574 type = ST_FS16;
575 written_type = ST_FS16_WRITTEN;
576 reset_type = ST_FS16_RESET;
577 }
578
579 fs_reg shader_end_time = get_timestamp();
580
581 /* Check that there weren't any timestamp reset events (assuming these
582 * were the only two timestamp reads that happened).
583 */
584 fs_reg reset = shader_end_time;
585 reset.smear = 2;
586 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
587 test->conditional_mod = BRW_CONDITIONAL_Z;
588 emit(IF(BRW_PREDICATE_NORMAL));
589
590 push_force_uncompressed();
591 fs_reg start = shader_start_time;
592 start.negate = true;
593 fs_reg diff = fs_reg(this, glsl_type::uint_type);
594 emit(ADD(diff, start, shader_end_time));
595
596 /* If there were no instructions between the two timestamp gets, the diff
597 * is 2 cycles. Remove that overhead, so I can forget about that when
598 * trying to determine the time taken for single instructions.
599 */
600 emit(ADD(diff, diff, fs_reg(-2u)));
601
602 emit_shader_time_write(type, diff);
603 emit_shader_time_write(written_type, fs_reg(1u));
604 emit(BRW_OPCODE_ELSE);
605 emit_shader_time_write(reset_type, fs_reg(1u));
606 emit(BRW_OPCODE_ENDIF);
607
608 pop_force_uncompressed();
609 }
610
611 void
612 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
613 fs_reg value)
614 {
615 int shader_time_index =
616 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
617 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
618
619 fs_reg payload;
620 if (dispatch_width == 8)
621 payload = fs_reg(this, glsl_type::uvec2_type);
622 else
623 payload = fs_reg(this, glsl_type::uint_type);
624
625 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
626 fs_reg(), payload, offset, value));
627 }
628
629 void
630 fs_visitor::fail(const char *format, ...)
631 {
632 va_list va;
633 char *msg;
634
635 if (failed)
636 return;
637
638 failed = true;
639
640 va_start(va, format);
641 msg = ralloc_vasprintf(mem_ctx, format, va);
642 va_end(va);
643 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
644
645 this->fail_msg = msg;
646
647 if (INTEL_DEBUG & DEBUG_WM) {
648 fprintf(stderr, "%s", msg);
649 }
650 }
651
652 fs_inst *
653 fs_visitor::emit(enum opcode opcode)
654 {
655 return emit(fs_inst(opcode));
656 }
657
658 fs_inst *
659 fs_visitor::emit(enum opcode opcode, fs_reg dst)
660 {
661 return emit(fs_inst(opcode, dst));
662 }
663
664 fs_inst *
665 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
666 {
667 return emit(fs_inst(opcode, dst, src0));
668 }
669
670 fs_inst *
671 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
672 {
673 return emit(fs_inst(opcode, dst, src0, src1));
674 }
675
676 fs_inst *
677 fs_visitor::emit(enum opcode opcode, fs_reg dst,
678 fs_reg src0, fs_reg src1, fs_reg src2)
679 {
680 return emit(fs_inst(opcode, dst, src0, src1, src2));
681 }
682
683 void
684 fs_visitor::push_force_uncompressed()
685 {
686 force_uncompressed_stack++;
687 }
688
689 void
690 fs_visitor::pop_force_uncompressed()
691 {
692 force_uncompressed_stack--;
693 assert(force_uncompressed_stack >= 0);
694 }
695
696 /**
697 * Returns true if the instruction has a flag that means it won't
698 * update an entire destination register.
699 *
700 * For example, dead code elimination and live variable analysis want to know
701 * when a write to a variable screens off any preceding values that were in
702 * it.
703 */
704 bool
705 fs_inst::is_partial_write()
706 {
707 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
708 this->force_uncompressed ||
709 this->force_sechalf);
710 }
711
712 int
713 fs_inst::regs_read(fs_visitor *v, int arg)
714 {
715 if (is_tex() && arg == 0 && src[0].file == GRF) {
716 if (v->dispatch_width == 16)
717 return (mlen + 1) / 2;
718 else
719 return mlen;
720 }
721 return 1;
722 }
723
724 bool
725 fs_inst::reads_flag()
726 {
727 return predicate;
728 }
729
730 bool
731 fs_inst::writes_flag()
732 {
733 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
734 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
735 }
736
737 /**
738 * Returns how many MRFs an FS opcode will write over.
739 *
740 * Note that this is not the 0 or 1 implied writes in an actual gen
741 * instruction -- the FS opcodes often generate MOVs in addition.
742 */
743 int
744 fs_visitor::implied_mrf_writes(fs_inst *inst)
745 {
746 if (inst->mlen == 0)
747 return 0;
748
749 if (inst->base_mrf == -1)
750 return 0;
751
752 switch (inst->opcode) {
753 case SHADER_OPCODE_RCP:
754 case SHADER_OPCODE_RSQ:
755 case SHADER_OPCODE_SQRT:
756 case SHADER_OPCODE_EXP2:
757 case SHADER_OPCODE_LOG2:
758 case SHADER_OPCODE_SIN:
759 case SHADER_OPCODE_COS:
760 return 1 * dispatch_width / 8;
761 case SHADER_OPCODE_POW:
762 case SHADER_OPCODE_INT_QUOTIENT:
763 case SHADER_OPCODE_INT_REMAINDER:
764 return 2 * dispatch_width / 8;
765 case SHADER_OPCODE_TEX:
766 case FS_OPCODE_TXB:
767 case SHADER_OPCODE_TXD:
768 case SHADER_OPCODE_TXF:
769 case SHADER_OPCODE_TXF_MS:
770 case SHADER_OPCODE_TXF_MCS:
771 case SHADER_OPCODE_TG4:
772 case SHADER_OPCODE_TG4_OFFSET:
773 case SHADER_OPCODE_TXL:
774 case SHADER_OPCODE_TXS:
775 case SHADER_OPCODE_LOD:
776 return 1;
777 case FS_OPCODE_FB_WRITE:
778 return 2;
779 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
780 case SHADER_OPCODE_GEN4_SCRATCH_READ:
781 return 1;
782 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
783 return inst->mlen;
784 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
785 return 2;
786 case SHADER_OPCODE_UNTYPED_ATOMIC:
787 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
788 return 0;
789 default:
790 assert(!"not reached");
791 return inst->mlen;
792 }
793 }
794
795 int
796 fs_visitor::virtual_grf_alloc(int size)
797 {
798 if (virtual_grf_array_size <= virtual_grf_count) {
799 if (virtual_grf_array_size == 0)
800 virtual_grf_array_size = 16;
801 else
802 virtual_grf_array_size *= 2;
803 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
804 virtual_grf_array_size);
805 }
806 virtual_grf_sizes[virtual_grf_count] = size;
807 return virtual_grf_count++;
808 }
809
810 /** Fixed HW reg constructor. */
811 fs_reg::fs_reg(enum register_file file, int reg)
812 {
813 init();
814 this->file = file;
815 this->reg = reg;
816 this->type = BRW_REGISTER_TYPE_F;
817 }
818
819 /** Fixed HW reg constructor. */
820 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
821 {
822 init();
823 this->file = file;
824 this->reg = reg;
825 this->type = type;
826 }
827
828 /** Automatic reg constructor. */
829 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
830 {
831 init();
832
833 this->file = GRF;
834 this->reg = v->virtual_grf_alloc(v->type_size(type));
835 this->reg_offset = 0;
836 this->type = brw_type_for_base_type(type);
837 }
838
839 fs_reg *
840 fs_visitor::variable_storage(ir_variable *var)
841 {
842 return (fs_reg *)hash_table_find(this->variable_ht, var);
843 }
844
845 void
846 import_uniforms_callback(const void *key,
847 void *data,
848 void *closure)
849 {
850 struct hash_table *dst_ht = (struct hash_table *)closure;
851 const fs_reg *reg = (const fs_reg *)data;
852
853 if (reg->file != UNIFORM)
854 return;
855
856 hash_table_insert(dst_ht, data, key);
857 }
858
859 /* For SIMD16, we need to follow from the uniform setup of the SIMD8 dispatch.
860 * This brings in those uniform definitions.
861 */
862 void
863 fs_visitor::import_uniforms(fs_visitor *v)
864 {
865 hash_table_call_foreach(v->variable_ht,
866 import_uniforms_callback,
867 variable_ht);
868 this->params_remap = v->params_remap;
869 this->nr_params_remap = v->nr_params_remap;
870 }
871
872 /* Our support for uniforms is piggy-backed on the struct
873 * gl_fragment_program, because that's where the values actually
874 * get stored, rather than in some global gl_shader_program uniform
875 * store.
876 */
877 void
878 fs_visitor::setup_uniform_values(ir_variable *ir)
879 {
880 int namelen = strlen(ir->name);
881
882 /* The data for our (non-builtin) uniforms is stored in a series of
883 * gl_uniform_driver_storage structs for each subcomponent that
884 * glGetUniformLocation() could name. We know it's been set up in the same
885 * order we'd walk the type, so walk the list of storage and find anything
886 * with our name, or the prefix of a component that starts with our name.
887 */
888 unsigned params_before = c->prog_data.nr_params;
889 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
890 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
891
892 if (strncmp(ir->name, storage->name, namelen) != 0 ||
893 (storage->name[namelen] != 0 &&
894 storage->name[namelen] != '.' &&
895 storage->name[namelen] != '[')) {
896 continue;
897 }
898
899 unsigned slots = storage->type->component_slots();
900 if (storage->array_elements)
901 slots *= storage->array_elements;
902
903 for (unsigned i = 0; i < slots; i++) {
904 c->prog_data.param[c->prog_data.nr_params++] =
905 &storage->storage[i].f;
906 }
907 }
908
909 /* Make sure we actually initialized the right amount of stuff here. */
910 assert(params_before + ir->type->component_slots() ==
911 c->prog_data.nr_params);
912 (void)params_before;
913 }
914
915
916 /* Our support for builtin uniforms is even scarier than non-builtin.
917 * It sits on top of the PROG_STATE_VAR parameters that are
918 * automatically updated from GL context state.
919 */
920 void
921 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
922 {
923 const ir_state_slot *const slots = ir->state_slots;
924 assert(ir->state_slots != NULL);
925
926 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
927 /* This state reference has already been setup by ir_to_mesa, but we'll
928 * get the same index back here.
929 */
930 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
931 (gl_state_index *)slots[i].tokens);
932
933 /* Add each of the unique swizzles of the element as a parameter.
934 * This'll end up matching the expected layout of the
935 * array/matrix/structure we're trying to fill in.
936 */
937 int last_swiz = -1;
938 for (unsigned int j = 0; j < 4; j++) {
939 int swiz = GET_SWZ(slots[i].swizzle, j);
940 if (swiz == last_swiz)
941 break;
942 last_swiz = swiz;
943
944 c->prog_data.param[c->prog_data.nr_params++] =
945 &fp->Base.Parameters->ParameterValues[index][swiz].f;
946 }
947 }
948 }
949
950 fs_reg *
951 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
952 {
953 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
954 fs_reg wpos = *reg;
955 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
956
957 /* gl_FragCoord.x */
958 if (ir->data.pixel_center_integer) {
959 emit(MOV(wpos, this->pixel_x));
960 } else {
961 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
962 }
963 wpos.reg_offset++;
964
965 /* gl_FragCoord.y */
966 if (!flip && ir->data.pixel_center_integer) {
967 emit(MOV(wpos, this->pixel_y));
968 } else {
969 fs_reg pixel_y = this->pixel_y;
970 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
971
972 if (flip) {
973 pixel_y.negate = true;
974 offset += c->key.drawable_height - 1.0;
975 }
976
977 emit(ADD(wpos, pixel_y, fs_reg(offset)));
978 }
979 wpos.reg_offset++;
980
981 /* gl_FragCoord.z */
982 if (brw->gen >= 6) {
983 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
984 } else {
985 emit(FS_OPCODE_LINTERP, wpos,
986 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
987 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
988 interp_reg(VARYING_SLOT_POS, 2));
989 }
990 wpos.reg_offset++;
991
992 /* gl_FragCoord.w: Already set up in emit_interpolation */
993 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
994
995 return reg;
996 }
997
998 fs_inst *
999 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1000 glsl_interp_qualifier interpolation_mode,
1001 bool is_centroid, bool is_sample)
1002 {
1003 brw_wm_barycentric_interp_mode barycoord_mode;
1004 if (brw->gen >= 6) {
1005 if (is_centroid) {
1006 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1007 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1008 else
1009 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1010 } else if (is_sample) {
1011 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1012 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1013 else
1014 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1015 } else {
1016 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1017 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1018 else
1019 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1020 }
1021 } else {
1022 /* On Ironlake and below, there is only one interpolation mode.
1023 * Centroid interpolation doesn't mean anything on this hardware --
1024 * there is no multisampling.
1025 */
1026 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1027 }
1028 return emit(FS_OPCODE_LINTERP, attr,
1029 this->delta_x[barycoord_mode],
1030 this->delta_y[barycoord_mode], interp);
1031 }
1032
1033 fs_reg *
1034 fs_visitor::emit_general_interpolation(ir_variable *ir)
1035 {
1036 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1037 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1038 fs_reg attr = *reg;
1039
1040 unsigned int array_elements;
1041 const glsl_type *type;
1042
1043 if (ir->type->is_array()) {
1044 array_elements = ir->type->length;
1045 if (array_elements == 0) {
1046 fail("dereferenced array '%s' has length 0\n", ir->name);
1047 }
1048 type = ir->type->fields.array;
1049 } else {
1050 array_elements = 1;
1051 type = ir->type;
1052 }
1053
1054 glsl_interp_qualifier interpolation_mode =
1055 ir->determine_interpolation_mode(c->key.flat_shade);
1056
1057 int location = ir->data.location;
1058 for (unsigned int i = 0; i < array_elements; i++) {
1059 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1060 if (c->prog_data.urb_setup[location] == -1) {
1061 /* If there's no incoming setup data for this slot, don't
1062 * emit interpolation for it.
1063 */
1064 attr.reg_offset += type->vector_elements;
1065 location++;
1066 continue;
1067 }
1068
1069 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1070 /* Constant interpolation (flat shading) case. The SF has
1071 * handed us defined values in only the constant offset
1072 * field of the setup reg.
1073 */
1074 for (unsigned int k = 0; k < type->vector_elements; k++) {
1075 struct brw_reg interp = interp_reg(location, k);
1076 interp = suboffset(interp, 3);
1077 interp.type = reg->type;
1078 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1079 attr.reg_offset++;
1080 }
1081 } else {
1082 /* Smooth/noperspective interpolation case. */
1083 for (unsigned int k = 0; k < type->vector_elements; k++) {
1084 /* FINISHME: At some point we probably want to push
1085 * this farther by giving similar treatment to the
1086 * other potentially constant components of the
1087 * attribute, as well as making brw_vs_constval.c
1088 * handle varyings other than gl_TexCoord.
1089 */
1090 struct brw_reg interp = interp_reg(location, k);
1091 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1092 ir->data.centroid,
1093 ir->data.sample || c->key.persample_shading);
1094 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1095 /* Get the pixel/sample mask into f0 so that we know
1096 * which pixels are lit. Then, for each channel that is
1097 * unlit, replace the centroid data with non-centroid
1098 * data.
1099 */
1100 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1101 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1102 interpolation_mode,
1103 false, false);
1104 inst->predicate = BRW_PREDICATE_NORMAL;
1105 inst->predicate_inverse = true;
1106 }
1107 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1108 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1109 }
1110 attr.reg_offset++;
1111 }
1112
1113 }
1114 location++;
1115 }
1116 }
1117
1118 return reg;
1119 }
1120
1121 fs_reg *
1122 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1123 {
1124 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1125
1126 /* The frontfacing comes in as a bit in the thread payload. */
1127 if (brw->gen >= 6) {
1128 emit(BRW_OPCODE_ASR, *reg,
1129 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1130 fs_reg(15));
1131 emit(BRW_OPCODE_NOT, *reg, *reg);
1132 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1133 } else {
1134 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1135 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1136 * us front face
1137 */
1138 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1139 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1140 }
1141
1142 return reg;
1143 }
1144
1145 void
1146 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1147 {
1148 assert(dst.type == BRW_REGISTER_TYPE_F);
1149
1150 if (c->key.compute_pos_offset) {
1151 /* Convert int_sample_pos to floating point */
1152 emit(MOV(dst, int_sample_pos));
1153 /* Scale to the range [0, 1] */
1154 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
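      /* For example, a payload value of 8 (sixteenths of a pixel, given the
       * 1/16 scale above) maps to the pixel-center position 0.5.
       */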
1155 }
1156 else {
1157 /* From ARB_sample_shading specification:
1158 * "When rendering to a non-multisample buffer, or if multisample
1159 * rasterization is disabled, gl_SamplePosition will always be
1160 * (0.5, 0.5)."
1161 */
1162 emit(MOV(dst, fs_reg(0.5f)));
1163 }
1164 }
1165
1166 fs_reg *
1167 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1168 {
1169 assert(brw->gen >= 6);
1170 assert(ir->type == glsl_type::vec2_type);
1171
1172 this->current_annotation = "compute sample position";
1173 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1174 fs_reg pos = *reg;
1175 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1176 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1177
1178 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1179 * mode will be enabled.
1180 *
1181 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1182 * R31.1:0 Position Offset X/Y for Slot[3:0]
1183 * R31.3:2 Position Offset X/Y for Slot[7:4]
1184 * .....
1185 *
1186 * The X, Y sample positions come in as bytes in thread payload. So, read
1187 * the positions using vstride=16, width=8, hstride=2.
1188 */
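   /* A sketch of what that region does, assuming X/Y byte pairs per slot as
    * the PRM excerpt above suggests: with vstride=16, width=8, hstride=2,
    * channel i reads payload byte 2*i, i.e. the X offsets at even bytes; the
    * suboffset(..., 1) used for int_sample_y below then picks up the Y
    * offsets at odd bytes.
    */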
1189 struct brw_reg sample_pos_reg =
1190 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1191 BRW_REGISTER_TYPE_B), 16, 8, 2);
1192
1193 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1194 if (dispatch_width == 16) {
1195 int_sample_x.sechalf = true;
1196 fs_inst *inst = emit(MOV(int_sample_x,
1197 fs_reg(suboffset(sample_pos_reg, 16))));
1198 inst->force_sechalf = true;
1199 int_sample_x.sechalf = false;
1200 }
1201 /* Compute gl_SamplePosition.x */
1202 compute_sample_position(pos, int_sample_x);
1203 pos.reg_offset++;
1204 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1205 if (dispatch_width == 16) {
1206 int_sample_y.sechalf = true;
1207 fs_inst *inst = emit(MOV(int_sample_y,
1208 fs_reg(suboffset(sample_pos_reg, 17))));
1209 inst->force_sechalf = true;
1210 int_sample_y.sechalf = false;
1211 }
1212 /* Compute gl_SamplePosition.y */
1213 compute_sample_position(pos, int_sample_y);
1214 return reg;
1215 }
1216
1217 fs_reg *
1218 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1219 {
1220 assert(brw->gen >= 6);
1221
1222 this->current_annotation = "compute sample id";
1223 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1224
1225 if (c->key.compute_sample_id) {
1226 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1227 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1228 t2.type = BRW_REGISTER_TYPE_UW;
1229
1230 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1231 * 8x multisampling, subspan 0 will represent sample N (where N
1232 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1233 * 7. We can find the value of N by looking at R0.0 bits 7:6
1234 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1235 * (since samples are always delivered in pairs). That is, we
1236 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1237 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1238 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1239 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1240 * populating a temporary variable with the sequence (0, 1, 2, 3),
1241 * and then reading from it using vstride=1, width=4, hstride=0.
1242 * These computations hold good for 4x multisampling as well.
1243 */
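   /* Worked example of the arithmetic above: if R0.0 bits 7:6 (SSPI) read 2,
    * then (R0.0 & 0xc0) >> 5 = 0x80 >> 5 = 4 = 2*SSPI, and adding the
    * SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1) gives sample IDs
    * 4, 4, 4, 4, 5, 5, 5, 5 for the two subspans.
    */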
1244 emit(BRW_OPCODE_AND, t1,
1245 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1246 fs_reg(brw_imm_d(0xc0)));
1247 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1248 /* This works for both SIMD8 and SIMD16 */
1249 emit(MOV(t2, brw_imm_v(0x3210)));
1250 /* This special instruction takes care of setting vstride=1,
1251 * width=4, hstride=0 of t2 during an ADD instruction.
1252 */
1253 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1254 } else {
1255 /* As per GL_ARB_sample_shading specification:
1256 * "When rendering to a non-multisample buffer, or if multisample
1257 * rasterization is disabled, gl_SampleID will always be zero."
1258 */
1259 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1260 }
1261
1262 return reg;
1263 }
1264
1265 fs_reg *
1266 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1267 {
1268 assert(brw->gen >= 7);
1269 this->current_annotation = "compute gl_SampleMaskIn";
1270 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1271 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1272 return reg;
1273 }
1274
1275 fs_reg
1276 fs_visitor::fix_math_operand(fs_reg src)
1277 {
1278 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1279 * might be able to do better by doing execsize = 1 math and then
1280 * expanding that result out, but we would need to be careful with
1281 * masking.
1282 *
1283 * The hardware ignores source modifiers (negate and abs) on math
1284 * instructions, so we also move to a temp to set those up.
1285 */
1286 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1287 !src.abs && !src.negate)
1288 return src;
1289
1290 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1291 * operands to math
1292 */
1293 if (brw->gen >= 7 && src.file != IMM)
1294 return src;
1295
1296 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1297 expanded.type = src.type;
1298 emit(BRW_OPCODE_MOV, expanded, src);
1299 return expanded;
1300 }
1301
1302 fs_inst *
1303 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1304 {
1305 switch (opcode) {
1306 case SHADER_OPCODE_RCP:
1307 case SHADER_OPCODE_RSQ:
1308 case SHADER_OPCODE_SQRT:
1309 case SHADER_OPCODE_EXP2:
1310 case SHADER_OPCODE_LOG2:
1311 case SHADER_OPCODE_SIN:
1312 case SHADER_OPCODE_COS:
1313 break;
1314 default:
1315 assert(!"not reached: bad math opcode");
1316 return NULL;
1317 }
1318
1319 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1320 * might be able to do better by doing execsize = 1 math and then
1321 * expanding that result out, but we would need to be careful with
1322 * masking.
1323 *
1324 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1325 * instructions, so we also move to a temp to set those up.
1326 */
1327 if (brw->gen >= 6)
1328 src = fix_math_operand(src);
1329
1330 fs_inst *inst = emit(opcode, dst, src);
1331
1332 if (brw->gen < 6) {
1333 inst->base_mrf = 2;
1334 inst->mlen = dispatch_width / 8;
1335 }
1336
1337 return inst;
1338 }
1339
1340 fs_inst *
1341 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1342 {
1343 int base_mrf = 2;
1344 fs_inst *inst;
1345
1346 switch (opcode) {
1347 case SHADER_OPCODE_INT_QUOTIENT:
1348 case SHADER_OPCODE_INT_REMAINDER:
1349 if (brw->gen >= 7 && dispatch_width == 16)
1350 fail("SIMD16 INTDIV unsupported\n");
1351 break;
1352 case SHADER_OPCODE_POW:
1353 break;
1354 default:
1355 assert(!"not reached: unsupported binary math opcode.");
1356 return NULL;
1357 }
1358
1359 if (brw->gen >= 6) {
1360 src0 = fix_math_operand(src0);
1361 src1 = fix_math_operand(src1);
1362
1363 inst = emit(opcode, dst, src0, src1);
1364 } else {
1365 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1366 * "Message Payload":
1367 *
1368 * "Operand0[7]. For the INT DIV functions, this operand is the
1369 * denominator."
1370 * ...
1371 * "Operand1[7]. For the INT DIV functions, this operand is the
1372 * numerator."
1373 */
1374 bool is_int_div = opcode != SHADER_OPCODE_POW;
1375 fs_reg &op0 = is_int_div ? src1 : src0;
1376 fs_reg &op1 = is_int_div ? src0 : src1;
1377
1378 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1379 inst = emit(opcode, dst, op0, reg_null_f);
1380
1381 inst->base_mrf = base_mrf;
1382 inst->mlen = 2 * dispatch_width / 8;
1383 }
1384 return inst;
1385 }
1386
1387 void
1388 fs_visitor::assign_curb_setup()
1389 {
1390 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1391 if (dispatch_width == 8) {
1392 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1393 } else {
1394 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1395 }
1396
1397 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1398 foreach_list(node, &this->instructions) {
1399 fs_inst *inst = (fs_inst *)node;
1400
1401 for (unsigned int i = 0; i < 3; i++) {
1402 if (inst->src[i].file == UNIFORM) {
1403 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1404 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1405 constant_nr / 8,
1406 constant_nr % 8);
1407
1408 inst->src[i].file = HW_REG;
1409 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1410 }
1411 }
1412 }
1413 }
1414
1415 void
1416 fs_visitor::calculate_urb_setup()
1417 {
1418 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1419 c->prog_data.urb_setup[i] = -1;
1420 }
1421
1422 int urb_next = 0;
1423 /* Figure out where each of the incoming setup attributes lands. */
1424 if (brw->gen >= 6) {
1425 if (_mesa_bitcount_64(fp->Base.InputsRead &
1426 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1427 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1428 * first 16 varying inputs, so we can put them wherever we want.
1429 * Just put them in order.
1430 *
1431 * This is useful because it means that (a) inputs not used by the
1432 * fragment shader won't take up valuable register space, and (b) we
1433 * won't have to recompile the fragment shader if it gets paired with
1434 * a different vertex (or geometry) shader.
1435 */
1436 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1437 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1438 BITFIELD64_BIT(i)) {
1439 c->prog_data.urb_setup[i] = urb_next++;
1440 }
1441 }
1442 } else {
1443 /* We have enough input varyings that the SF/SBE pipeline stage can't
1444 * arbitrarily rearrange them to suit our whim; we have to put them
1445 * in an order that matches the output of the previous pipeline stage
1446 * (geometry or vertex shader).
1447 */
1448 struct brw_vue_map prev_stage_vue_map;
1449 brw_compute_vue_map(brw, &prev_stage_vue_map,
1450 c->key.input_slots_valid);
1451 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1452 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1453 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1454 slot++) {
1455 int varying = prev_stage_vue_map.slot_to_varying[slot];
1456 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1457 * unused.
1458 */
1459 if (varying != BRW_VARYING_SLOT_COUNT &&
1460 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1461 BITFIELD64_BIT(varying))) {
1462 c->prog_data.urb_setup[varying] = slot - first_slot;
1463 }
1464 }
1465 urb_next = prev_stage_vue_map.num_slots - first_slot;
1466 }
1467 } else {
1468 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1469 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1470 /* Point size is packed into the header, not as a general attribute */
1471 if (i == VARYING_SLOT_PSIZ)
1472 continue;
1473
1474 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1475 /* The back color slot is skipped when the front color is
1476 * also written to. In addition, some slots can be
1477 * written in the vertex shader and not read in the
1478 * fragment shader. So the register number must always be
1479 * incremented, mapped or not.
1480 */
1481 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1482 c->prog_data.urb_setup[i] = urb_next;
1483 urb_next++;
1484 }
1485 }
1486
1487 /*
1488 * It's an FS-only attribute, and we did the interpolation for this
1489 * attribute in the SF thread. So, count it here, too.
1490 *
1491 * See compile_sf_prog() for more info.
1492 */
1493 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1494 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1495 }
1496
1497 c->prog_data.num_varying_inputs = urb_next;
1498 }
1499
1500 void
1501 fs_visitor::assign_urb_setup()
1502 {
1503 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1504
1505 /* Offset all the urb_setup[] index by the actual position of the
1506 * setup regs, now that the location of the constants has been chosen.
1507 */
1508 foreach_list(node, &this->instructions) {
1509 fs_inst *inst = (fs_inst *)node;
1510
1511 if (inst->opcode == FS_OPCODE_LINTERP) {
1512 assert(inst->src[2].file == HW_REG);
1513 inst->src[2].fixed_hw_reg.nr += urb_start;
1514 }
1515
1516 if (inst->opcode == FS_OPCODE_CINTERP) {
1517 assert(inst->src[0].file == HW_REG);
1518 inst->src[0].fixed_hw_reg.nr += urb_start;
1519 }
1520 }
1521
1522 /* Each attribute is 4 setup channels, each of which is half a reg. */
1523 this->first_non_payload_grf =
1524 urb_start + c->prog_data.num_varying_inputs * 2;
1525 }
1526
1527 /**
1528 * Split large virtual GRFs into separate components if we can.
1529 *
1530 * This is mostly duplicated with what brw_fs_vector_splitting does,
1531 * but that's really conservative because it's afraid of doing
1532 * splitting that doesn't result in real progress after the rest of
1533 * the optimization phases, which would cause infinite looping in
1534 * optimization. We can do it once here, safely. This also has the
1535 * opportunity to split interpolated values, or maybe even uniforms,
1536 * which we don't have at the IR level.
1537 *
1538 * We want to split, because virtual GRFs are what we register
1539 * allocate and spill (due to contiguousness requirements for some
1540 * instructions), and they're what we naturally generate in the
1541 * codegen process, but most virtual GRFs don't actually need to be
1542 * contiguous sets of GRFs. If we split, we'll end up with reduced
1543 * live intervals and better dead code elimination and coalescing.
1544 */
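/* For example, a virtual GRF of size 4 holding a texture result that is only
 * read one component at a time can become four size-1 virtual GRFs, each with
 * its own shorter live interval, unless a SEND below requires the destination
 * to stay contiguous.
 */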
1545 void
1546 fs_visitor::split_virtual_grfs()
1547 {
1548 int num_vars = this->virtual_grf_count;
1549 bool split_grf[num_vars];
1550 int new_virtual_grf[num_vars];
1551
1552 /* Try to split anything > 0 sized. */
1553 for (int i = 0; i < num_vars; i++) {
1554 if (this->virtual_grf_sizes[i] != 1)
1555 split_grf[i] = true;
1556 else
1557 split_grf[i] = false;
1558 }
1559
1560 if (brw->has_pln &&
1561 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1562 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1563 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1564 * Gen6, that was the only supported interpolation mode, and since Gen6,
1565 * delta_x and delta_y are in fixed hardware registers.
1566 */
1567 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1568 false;
1569 }
1570
1571 foreach_list(node, &this->instructions) {
1572 fs_inst *inst = (fs_inst *)node;
1573
1574 /* If there's a SEND message that requires contiguous destination
1575 * registers, no splitting is allowed.
1576 */
1577 if (inst->regs_written > 1) {
1578 split_grf[inst->dst.reg] = false;
1579 }
1580
1581 /* If we're sending from a GRF, don't split it, on the assumption that
1582 * the send is reading the whole thing.
1583 */
1584 if (inst->is_send_from_grf()) {
1585 for (int i = 0; i < 3; i++) {
1586 if (inst->src[i].file == GRF) {
1587 split_grf[inst->src[i].reg] = false;
1588 }
1589 }
1590 }
1591 }
1592
1593 /* Allocate new space for split regs. Note that the virtual
1594 * numbers will be contiguous.
1595 */
1596 for (int i = 0; i < num_vars; i++) {
1597 if (split_grf[i]) {
1598 new_virtual_grf[i] = virtual_grf_alloc(1);
1599 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1600 int reg = virtual_grf_alloc(1);
1601 assert(reg == new_virtual_grf[i] + j - 1);
1602 (void) reg;
1603 }
1604 this->virtual_grf_sizes[i] = 1;
1605 }
1606 }
1607
1608 foreach_list(node, &this->instructions) {
1609 fs_inst *inst = (fs_inst *)node;
1610
1611 if (inst->dst.file == GRF &&
1612 split_grf[inst->dst.reg] &&
1613 inst->dst.reg_offset != 0) {
1614 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1615 inst->dst.reg_offset - 1);
1616 inst->dst.reg_offset = 0;
1617 }
1618 for (int i = 0; i < 3; i++) {
1619 if (inst->src[i].file == GRF &&
1620 split_grf[inst->src[i].reg] &&
1621 inst->src[i].reg_offset != 0) {
1622 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1623 inst->src[i].reg_offset - 1);
1624 inst->src[i].reg_offset = 0;
1625 }
1626 }
1627 }
1628 invalidate_live_intervals();
1629 }
1630
1631 /**
1632 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1633 *
1634 * During code generation, we create tons of temporary variables, many of
1635 * which get immediately killed and are never used again. Yet, in later
1636 * optimization and analysis passes, such as compute_live_intervals, we need
1637 * to loop over all the virtual GRFs. Compacting them can save a lot of
1638 * overhead.
1639 */
1640 void
1641 fs_visitor::compact_virtual_grfs()
1642 {
1643 /* Mark which virtual GRFs are used, and count how many. */
1644 int remap_table[this->virtual_grf_count];
1645 memset(remap_table, -1, sizeof(remap_table));
1646
1647 foreach_list(node, &this->instructions) {
1648 const fs_inst *inst = (const fs_inst *) node;
1649
1650 if (inst->dst.file == GRF)
1651 remap_table[inst->dst.reg] = 0;
1652
1653 for (int i = 0; i < 3; i++) {
1654 if (inst->src[i].file == GRF)
1655 remap_table[inst->src[i].reg] = 0;
1656 }
1657 }
1658
1659 /* In addition to registers used in instructions, fs_visitor keeps
1660 * direct references to certain special values which must be patched:
1661 */
1662 fs_reg *special[] = {
1663 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1664 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1665 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1666 &delta_x[0], &delta_x[1], &delta_x[2],
1667 &delta_x[3], &delta_x[4], &delta_x[5],
1668 &delta_y[0], &delta_y[1], &delta_y[2],
1669 &delta_y[3], &delta_y[4], &delta_y[5],
1670 };
1671 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1672 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1673
1674 /* Treat all special values as used, to be conservative */
1675 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1676 if (special[i]->file == GRF)
1677 remap_table[special[i]->reg] = 0;
1678 }
1679
1680 /* Compact the GRF arrays. */
1681 int new_index = 0;
1682 for (int i = 0; i < this->virtual_grf_count; i++) {
1683 if (remap_table[i] != -1) {
1684 remap_table[i] = new_index;
1685 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1686 invalidate_live_intervals();
1687 ++new_index;
1688 }
1689 }
1690
1691 this->virtual_grf_count = new_index;
1692
1693 /* Patch all the instructions to use the newly renumbered registers */
1694 foreach_list(node, &this->instructions) {
1695 fs_inst *inst = (fs_inst *) node;
1696
1697 if (inst->dst.file == GRF)
1698 inst->dst.reg = remap_table[inst->dst.reg];
1699
1700 for (int i = 0; i < 3; i++) {
1701 if (inst->src[i].file == GRF)
1702 inst->src[i].reg = remap_table[inst->src[i].reg];
1703 }
1704 }
1705
1706 /* Patch all the references to special values */
1707 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1708 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1709 special[i]->reg = remap_table[special[i]->reg];
1710 }
1711 }
1712
1713 bool
1714 fs_visitor::remove_dead_constants()
1715 {
1716 if (dispatch_width == 8) {
1717 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1718 this->nr_params_remap = c->prog_data.nr_params;
1719
1720 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1721 this->params_remap[i] = -1;
1722
1723 /* Find which params are still in use. */
1724 foreach_list(node, &this->instructions) {
1725 fs_inst *inst = (fs_inst *)node;
1726
1727 for (int i = 0; i < 3; i++) {
1728 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1729
1730 if (inst->src[i].file != UNIFORM)
1731 continue;
1732
1733 /* Section 5.11 of the OpenGL 4.3 spec says:
1734 *
1735 * "Out-of-bounds reads return undefined values, which include
1736 * values from other variables of the active program or zero."
1737 */
1738 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1739 constant_nr = 0;
1740 }
1741
1742 /* For now, set this to non-negative. We'll give it the
1743 * actual new number in a moment, in order to keep the
1744 * register numbers nicely ordered.
1745 */
1746 this->params_remap[constant_nr] = 0;
1747 }
1748 }
1749
1750 /* Figure out what the new numbers for the params will be. At some
1751 * point when we're doing uniform array access, we're going to want
1752 * to keep the distinction between .reg and .reg_offset, but for
1753 * now we don't care.
1754 */
1755 unsigned int new_nr_params = 0;
1756 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1757 if (this->params_remap[i] != -1) {
1758 this->params_remap[i] = new_nr_params++;
1759 }
1760 }
1761
1762 /* Update the list of params to be uploaded to match our new numbering. */
1763 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1764 int remapped = this->params_remap[i];
1765
1766 if (remapped == -1)
1767 continue;
1768
1769 c->prog_data.param[remapped] = c->prog_data.param[i];
1770 }
1771
1772 c->prog_data.nr_params = new_nr_params;
1773 } else {
1774 /* This should have been generated in the SIMD8 pass already. */
1775 assert(this->params_remap);
1776 }
1777
1778 /* Now do the renumbering of the shader to remove unused params. */
1779 foreach_list(node, &this->instructions) {
1780 fs_inst *inst = (fs_inst *)node;
1781
1782 for (int i = 0; i < 3; i++) {
1783 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1784
1785 if (inst->src[i].file != UNIFORM)
1786 continue;
1787
1788 /* as above alias to 0 */
1789 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1790 constant_nr = 0;
1791 }
1792 assert(this->params_remap[constant_nr] != -1);
1793 inst->src[i].reg = this->params_remap[constant_nr];
1794 inst->src[i].reg_offset = 0;
1795 }
1796 }
1797
1798 return true;
1799 }
1800
1801 /*
1802 * Implements array access of uniforms by inserting a
1803 * PULL_CONSTANT_LOAD instruction.
1804 *
1805 * Unlike temporary GRF array access (where we don't support it due to
1806 * the difficulty of doing relative addressing on instruction
1807 * destinations), we could potentially do array access of uniforms
1808 * that were loaded in GRF space as push constants. In real-world
1809 * usage we've seen, though, the arrays being used are always larger
1810 * than we could load as push constants, so just always move all
1811 * uniform array access out to a pull constant buffer.
1812 */
1813 void
1814 fs_visitor::move_uniform_array_access_to_pull_constants()
1815 {
1816 int pull_constant_loc[c->prog_data.nr_params];
1817
1818 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1819 pull_constant_loc[i] = -1;
1820 }
1821
1822 /* Walk through and find array access of uniforms. Put a copy of that
1823 * uniform in the pull constant buffer.
1824 *
1825 * Note that we don't move constant-indexed accesses to arrays. No
1826 * testing has been done of the performance impact of this choice.
1827 */
1828 foreach_list_safe(node, &this->instructions) {
1829 fs_inst *inst = (fs_inst *)node;
1830
1831 for (int i = 0 ; i < 3; i++) {
1832 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1833 continue;
1834
1835 int uniform = inst->src[i].reg;
1836
1837 /* If this array isn't already present in the pull constant buffer,
1838 * add it.
1839 */
1840 if (pull_constant_loc[uniform] == -1) {
1841 const float **values = &c->prog_data.param[uniform];
1842
1843 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1844
1845 assert(param_size[uniform]);
1846
1847 for (int j = 0; j < param_size[uniform]; j++) {
1848 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1849 values[j];
1850 }
1851 }
1852
1853 /* Set up the annotation tracking for new generated instructions. */
1854 base_ir = inst->ir;
1855 current_annotation = inst->annotation;
1856
1857 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1858 fs_reg temp = fs_reg(this, glsl_type::float_type);
1859 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1860 surf_index,
1861 *inst->src[i].reladdr,
1862 pull_constant_loc[uniform] +
1863 inst->src[i].reg_offset);
1864 inst->insert_before(&list);
1865
1866 inst->src[i].file = temp.file;
1867 inst->src[i].reg = temp.reg;
1868 inst->src[i].reg_offset = temp.reg_offset;
1869 inst->src[i].reladdr = NULL;
1870 }
1871 }
1872 }
1873
1874 /**
1875 * Choose accesses from the UNIFORM file to demote to using the pull
1876 * constant buffer.
1877 *
1878 * We allow a fragment shader to have more than the specified minimum
1879 * maximum number of fragment shader uniform components (64). If
1880 * there are too many of these, they'd fill up all of register space.
1881 * So, this will push some of them out to the pull constant buffer and
1882 * update the program to load them.
1883 */
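/* Illustrative sketch (register numbers invented): once the first 128
 * components are kept as push constants, a read of a later uniform such as
 *
 *    add vgrf6:F, vgrf2:F, u130:F
 *
 * gets a UNIFORM_PULL_CONSTANT_LOAD of the containing vec4 inserted before
 * it and reads the loaded GRF instead, with .smear picking the right dword:
 *
 *    <uniform pull constant load> vgrf8:F, <surf_index>, <aligned offset>
 *    add vgrf6:F, vgrf2:F, vgrf8:F
 */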
1884 void
1885 fs_visitor::setup_pull_constants()
1886 {
1887 /* Only allow 16 registers (128 uniform components) as push constants. */
1888 unsigned int max_uniform_components = 16 * 8;
1889 if (c->prog_data.nr_params <= max_uniform_components)
1890 return;
1891
1892 if (dispatch_width == 16) {
1893 fail("Pull constants not supported in SIMD16\n");
1894 return;
1895 }
1896
1897 /* Just demote the end of the list. We could probably do better
1898 * here, demoting things that are rarely used in the program first.
1899 */
1900 unsigned int pull_uniform_base = max_uniform_components;
1901
1902 int pull_constant_loc[c->prog_data.nr_params];
1903 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1904 if (i < pull_uniform_base) {
1905 pull_constant_loc[i] = -1;
1906 } else {
1907 pull_constant_loc[i] = -1;
1908 /* If our constant is already being uploaded for reladdr purposes,
1909 * reuse it.
1910 */
1911 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1912 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1913 pull_constant_loc[i] = j;
1914 break;
1915 }
1916 }
1917 if (pull_constant_loc[i] == -1) {
1918 int pull_index = c->prog_data.nr_pull_params++;
1919 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1920 pull_constant_loc[i] = pull_index;
1921 }
1922 }
1923 }
1924 c->prog_data.nr_params = pull_uniform_base;
1925
1926 foreach_list(node, &this->instructions) {
1927 fs_inst *inst = (fs_inst *)node;
1928
1929 for (int i = 0; i < 3; i++) {
1930 if (inst->src[i].file != UNIFORM)
1931 continue;
1932
1933 int pull_index = pull_constant_loc[inst->src[i].reg +
1934 inst->src[i].reg_offset];
1935 if (pull_index == -1)
1936 continue;
1937
1938 assert(!inst->src[i].reladdr);
1939
1940 fs_reg dst = fs_reg(this, glsl_type::float_type);
1941 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1942 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1943 fs_inst *pull =
1944 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1945 dst, index, offset);
1946 pull->ir = inst->ir;
1947 pull->annotation = inst->annotation;
1948
1949 inst->insert_before(pull);
1950
1951 inst->src[i].file = GRF;
1952 inst->src[i].reg = dst.reg;
1953 inst->src[i].reg_offset = 0;
1954 inst->src[i].smear = pull_index & 3;
1955 }
1956 }
1957 }
1958
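/**
 * Performs simple algebraic simplifications on instructions whose sources
 * make them trivial, e.g. (illustrative):
 *
 *    mul vgrf4:F, vgrf2:F, 1.0f   ->  mov vgrf4:F, vgrf2:F
 *    mul vgrf4:F, vgrf2:F, 0.0f   ->  mov vgrf4:F, 0.0f
 *    add vgrf4:F, vgrf2:F, 0.0f   ->  mov vgrf4:F, vgrf2:F
 *    or  vgrf4:UD, vgrf2:UD, vgrf2:UD  ->  mov vgrf4:UD, vgrf2:UD
 *
 * plus similar rewrites for LRP with identical interpolants and for
 * saturating SEL against an immediate.
 */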
1959 bool
1960 fs_visitor::opt_algebraic()
1961 {
1962 bool progress = false;
1963
1964 foreach_list(node, &this->instructions) {
1965 fs_inst *inst = (fs_inst *)node;
1966
1967 switch (inst->opcode) {
1968 case BRW_OPCODE_MUL:
1969 if (inst->src[1].file != IMM)
1970 continue;
1971
1972 /* a * 1.0 = a */
1973 if (inst->src[1].is_one()) {
1974 inst->opcode = BRW_OPCODE_MOV;
1975 inst->src[1] = reg_undef;
1976 progress = true;
1977 break;
1978 }
1979
1980 /* a * 0.0 = 0.0 */
1981 if (inst->src[1].is_zero()) {
1982 inst->opcode = BRW_OPCODE_MOV;
1983 inst->src[0] = inst->src[1];
1984 inst->src[1] = reg_undef;
1985 progress = true;
1986 break;
1987 }
1988
1989 break;
1990 case BRW_OPCODE_ADD:
1991 if (inst->src[1].file != IMM)
1992 continue;
1993
1994 /* a + 0.0 = a */
1995 if (inst->src[1].is_zero()) {
1996 inst->opcode = BRW_OPCODE_MOV;
1997 inst->src[1] = reg_undef;
1998 progress = true;
1999 break;
2000 }
2001 break;
2002 case BRW_OPCODE_OR:
2003 if (inst->src[0].equals(inst->src[1])) {
2004 inst->opcode = BRW_OPCODE_MOV;
2005 inst->src[1] = reg_undef;
2006 progress = true;
2007 break;
2008 }
2009 break;
2010 case BRW_OPCODE_LRP:
2011 if (inst->src[1].equals(inst->src[2])) {
2012 inst->opcode = BRW_OPCODE_MOV;
2013 inst->src[0] = inst->src[1];
2014 inst->src[1] = reg_undef;
2015 inst->src[2] = reg_undef;
2016 progress = true;
2017 break;
2018 }
2019 break;
2020 case BRW_OPCODE_SEL:
2021 if (inst->saturate && inst->src[1].file == IMM) {
2022 switch (inst->conditional_mod) {
2023 case BRW_CONDITIONAL_LE:
2024 case BRW_CONDITIONAL_L:
2025 switch (inst->src[1].type) {
2026 case BRW_REGISTER_TYPE_F:
2027 if (inst->src[1].imm.f >= 1.0f) {
2028 inst->opcode = BRW_OPCODE_MOV;
2029 inst->src[1] = reg_undef;
inst->conditional_mod = BRW_CONDITIONAL_NONE;
2030 progress = true;
2031 }
2032 break;
2033 default:
2034 break;
2035 }
2036 break;
2037 case BRW_CONDITIONAL_GE:
2038 case BRW_CONDITIONAL_G:
2039 switch (inst->src[1].type) {
2040 case BRW_REGISTER_TYPE_F:
2041 if (inst->src[1].imm.f <= 0.0f) {
2042 inst->opcode = BRW_OPCODE_MOV;
2043 inst->src[1] = reg_undef;
2044 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2045 progress = true;
2046 }
2047 break;
2048 default:
2049 break;
2050 }
2051 default:
2052 break;
2053 }
2054 }
2055 break;
2056 default:
2057 break;
2058 }
2059 }
2060
2061 return progress;
2062 }
2063
2064 /**
2065 * Removes any instructions writing a VGRF where that VGRF is not used by any
2066 * later instruction.
2067 */
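/* For example (illustrative), if vgrf7 is not read by any later instruction,
 *
 *    add vgrf7:F, vgrf1:F, vgrf2:F
 *
 * is removed outright. ADDC/SUBB/MACH are kept but retargeted to the null
 * register so their implicit accumulator write is preserved.
 */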
2068 bool
2069 fs_visitor::dead_code_eliminate()
2070 {
2071 bool progress = false;
2072 int pc = 0;
2073
2074 calculate_live_intervals();
2075
2076 foreach_list_safe(node, &this->instructions) {
2077 fs_inst *inst = (fs_inst *)node;
2078
2079 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2080 bool dead = true;
2081
2082 for (int i = 0; i < inst->regs_written; i++) {
2083 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2084 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2085 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2086 dead = false;
2087 break;
2088 }
2089 }
2090
2091 if (dead) {
2092 /* Don't dead code eliminate instructions that write to the
2093 * accumulator as a side-effect. Instead just set the destination
2094 * to the null register to free it.
2095 */
2096 switch (inst->opcode) {
2097 case BRW_OPCODE_ADDC:
2098 case BRW_OPCODE_SUBB:
2099 case BRW_OPCODE_MACH:
2100 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2101 break;
2102 default:
2103 inst->remove();
2104 progress = true;
2105 break;
2106 }
2107 }
2108 }
2109
2110 pc++;
2111 }
2112
2113 if (progress)
2114 invalidate_live_intervals();
2115
2116 return progress;
2117 }
2118
2119 struct dead_code_hash_key
2120 {
2121 int vgrf;
2122 int reg_offset;
2123 };
2124
2125 static bool
2126 dead_code_hash_compare(const void *a, const void *b)
2127 {
2128 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2129 }
2130
2131 static void
2132 clear_dead_code_hash(struct hash_table *ht)
2133 {
2134 struct hash_entry *entry;
2135
2136 hash_table_foreach(ht, entry) {
2137 _mesa_hash_table_remove(ht, entry);
2138 }
2139 }
2140
2141 static void
2142 insert_dead_code_hash(struct hash_table *ht,
2143 int vgrf, int reg_offset, fs_inst *inst)
2144 {
2145 /* We don't bother freeing keys; they're ralloc'd off the ht and freed with it. */
2146 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2147
2148 key->vgrf = vgrf;
2149 key->reg_offset = reg_offset;
2150
2151 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2152 }
2153
2154 static struct hash_entry *
2155 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2156 {
2157 struct dead_code_hash_key key;
2158
2159 key.vgrf = vgrf;
2160 key.reg_offset = reg_offset;
2161
2162 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2163 }
2164
2165 static void
2166 remove_dead_code_hash(struct hash_table *ht,
2167 int vgrf, int reg_offset)
2168 {
2169 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2170 if (!entry)
2171 return;
2172
2173 _mesa_hash_table_remove(ht, entry);
2174 }
2175
2176 /**
2177 * Walks basic blocks, removing any regs that are written but not read before
2178 * being redefined.
2179 *
2180 * The dead_code_eliminate() function implements a global dead code
2181 * elimination, but it only handles removing the last write to a register
2182 * if it's never read. This one can handle intermediate writes, but only
2183 * within a basic block.
2184 */
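/* Illustrative example: within a single basic block,
 *
 *    mov vgrf4:F, vgrf1:F
 *    mov vgrf4:F, vgrf2:F
 *
 * the first MOV is removed, since vgrf4 is completely overwritten before
 * ever being read.
 */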
2185 bool
2186 fs_visitor::dead_code_eliminate_local()
2187 {
2188 struct hash_table *ht;
2189 bool progress = false;
2190
2191 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2192
2193 foreach_list_safe(node, &this->instructions) {
2194 fs_inst *inst = (fs_inst *)node;
2195
2196 /* At a basic block boundary, empty the HT since we don't track dataflow
2197 * across blocks.
2198 */
2199 if (inst->is_control_flow()) {
2200 clear_dead_code_hash(ht);
2201 continue;
2202 }
2203
2204 /* Clear the HT of any instructions that got read. */
2205 for (int i = 0; i < 3; i++) {
2206 fs_reg src = inst->src[i];
2207 if (src.file != GRF)
2208 continue;
2209
2210 int read = 1;
2211 if (inst->is_send_from_grf())
2212 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2213
2214 for (int reg_offset = src.reg_offset;
2215 reg_offset < src.reg_offset + read;
2216 reg_offset++) {
2217 remove_dead_code_hash(ht, src.reg, reg_offset);
2218 }
2219 }
2220
2221 /* Add any update of a GRF to the HT, removing a previous write if it
2222 * wasn't read.
2223 */
2224 if (inst->dst.file == GRF) {
2225 if (inst->regs_written > 1) {
2226 /* We don't know how to trim channels from an instruction's
2227 * writes, so we can't incrementally remove unread channels from
2228 * it. Just remove whatever it overwrites from the table.
2229 */
2230 for (int i = 0; i < inst->regs_written; i++) {
2231 remove_dead_code_hash(ht,
2232 inst->dst.reg,
2233 inst->dst.reg_offset + i);
2234 }
2235 } else {
2236 struct hash_entry *entry =
2237 get_dead_code_hash_entry(ht, inst->dst.reg,
2238 inst->dst.reg_offset);
2239
2240 if (entry) {
2241 if (inst->is_partial_write()) {
2242 /* For a partial write, we can't remove any previous dead code
2243 * candidate, since we're just modifying its result.
2244 */
2245 } else {
2246 /* We're completely updating a channel, and there was a
2247 * previous write to the channel that wasn't read. Kill it!
2248 */
2249 fs_inst *dead_inst = (fs_inst *)entry->data;
2250 dead_inst->remove();
2251 progress = true;
2252 }
2253
2254 _mesa_hash_table_remove(ht, entry);
2255 }
2256
2257 if (!inst->has_side_effects())
2258 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2259 inst);
2260 }
2261 }
2262 }
2263
2264 _mesa_hash_table_destroy(ht, NULL);
2265
2266 if (progress)
2267 invalidate_live_intervals();
2268
2269 return progress;
2270 }
2271
2272 /**
2273 * Implements register coalescing: Checks if the two registers involved in a
2274 * raw move don't interfere, in which case they can both be stored in the same
2275 * place and the MOV removed.
2276 *
2277 * To do this, all uses of the source of the MOV in the shader are replaced
2278 * with the destination of the MOV. For example:
2279 *
2280 * add vgrf3:F, vgrf1:F, vgrf2:F
2281 * mov vgrf4:F, vgrf3:F
2282 * mul vgrf5:F, vgrf5:F, vgrf4:F
2283 *
2284 * becomes
2285 *
2286 * add vgrf4:F, vgrf1:F, vgrf2:F
2287 * mul vgrf5:F, vgrf5:F, vgrf4:F
2288 */
2289 bool
2290 fs_visitor::register_coalesce()
2291 {
2292 bool progress = false;
2293
2294 calculate_live_intervals();
2295
2296 int src_size = 0;
2297 int channels_remaining = 0;
2298 int reg_from = -1, reg_to = -1;
2299 int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2300 fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2301
2302 foreach_list(node, &this->instructions) {
2303 fs_inst *inst = (fs_inst *)node;
2304
2305 if (inst->opcode != BRW_OPCODE_MOV ||
2306 inst->is_partial_write() ||
2307 inst->saturate ||
2308 inst->src[0].file != GRF ||
2309 inst->src[0].negate ||
2310 inst->src[0].abs ||
2311 inst->src[0].smear != -1 ||
2312 inst->dst.file != GRF ||
2313 inst->dst.type != inst->src[0].type) {
2314 continue;
2315 }
2316
2317 if (virtual_grf_sizes[inst->src[0].reg] >
2318 virtual_grf_sizes[inst->dst.reg])
2319 continue;
2320
2321 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2322 int var_to = live_intervals->var_from_reg(&inst->dst);
2323
2324 if (live_intervals->vars_interfere(var_from, var_to) &&
2325 !inst->dst.equals(inst->src[0])) {
2326
2327 /* We know that the live ranges of A (var_from) and B (var_to)
2328 * interfere because of the ->vars_interfere() call above. If the end
2329 * of B's live range is after the end of A's range, then we know two
2330 * things:
2331 * - the start of B's live range must be in A's live range (since we
2332 * already know the two ranges interfere, this is the only remaining
2333 * possibility)
2334 * - the interference isn't of the form we're looking for (where B is
2335 * entirely inside A)
2336 */
2337 if (live_intervals->end[var_to] > live_intervals->end[var_from])
2338 continue;
2339
2340 bool overwritten = false;
2341 int scan_ip = -1;
2342
2343 foreach_list(n, &this->instructions) {
2344 fs_inst *scan_inst = (fs_inst *)n;
2345 scan_ip++;
2346
2347 if (scan_inst->is_control_flow()) {
2348 overwritten = true;
2349 break;
2350 }
2351
2352 if (scan_ip <= live_intervals->start[var_to])
2353 continue;
2354
2355 if (scan_ip > live_intervals->end[var_to])
2356 break;
2357
2358 if (scan_inst->dst.equals(inst->dst) ||
2359 scan_inst->dst.equals(inst->src[0])) {
2360 overwritten = true;
2361 break;
2362 }
2363 }
2364
2365 if (overwritten)
2366 continue;
2367 }
2368
2369 if (reg_from != inst->src[0].reg) {
2370 reg_from = inst->src[0].reg;
2371
2372 src_size = virtual_grf_sizes[inst->src[0].reg];
2373 assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2374
2375 channels_remaining = src_size;
2376 memset(mov, 0, sizeof(mov));
2377
2378 reg_to = inst->dst.reg;
2379 }
2380
2381 if (reg_to != inst->dst.reg)
2382 continue;
2383
2384 const int offset = inst->src[0].reg_offset;
2385 reg_to_offset[offset] = inst->dst.reg_offset;
2386 mov[offset] = inst;
2387 channels_remaining--;
2388
2389 if (channels_remaining)
2390 continue;
2391
2392 bool removed = false;
2393 for (int i = 0; i < src_size; i++) {
2394 if (mov[i]) {
2395 removed = true;
2396
2397 mov[i]->opcode = BRW_OPCODE_NOP;
2398 mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2399 mov[i]->dst = reg_undef;
2400 mov[i]->src[0] = reg_undef;
2401 mov[i]->src[1] = reg_undef;
2402 mov[i]->src[2] = reg_undef;
2403 }
2404 }
2405
2406 foreach_list(node, &this->instructions) {
2407 fs_inst *scan_inst = (fs_inst *)node;
2408
2409 for (int i = 0; i < src_size; i++) {
2410 if (mov[i]) {
2411 if (scan_inst->dst.file == GRF &&
2412 scan_inst->dst.reg == reg_from &&
2413 scan_inst->dst.reg_offset == i) {
2414 scan_inst->dst.reg = reg_to;
2415 scan_inst->dst.reg_offset = reg_to_offset[i];
2416 }
2417 for (int j = 0; j < 3; j++) {
2418 if (scan_inst->src[j].file == GRF &&
2419 scan_inst->src[j].reg == reg_from &&
2420 scan_inst->src[j].reg_offset == i) {
2421 scan_inst->src[j].reg = reg_to;
2422 scan_inst->src[j].reg_offset = reg_to_offset[i];
2423 }
2424 }
2425 }
2426 }
2427 }
2428
2429 if (removed) {
2430 live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2431 live_intervals->start[var_from]);
2432 live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2433 live_intervals->end[var_from]);
2434 reg_from = -1;
2435 }
2436 }
2437
2438 foreach_list_safe(node, &this->instructions) {
2439 fs_inst *inst = (fs_inst *)node;
2440
2441 if (inst->opcode == BRW_OPCODE_NOP) {
2442 inst->remove();
2443 progress = true;
2444 }
2445 }
2446
2447 if (progress)
2448 invalidate_live_intervals();
2449
2450 return progress;
2451 }
2452
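/**
 * Tries to rewrite the instruction that produced a GRF so that it writes an
 * MRF directly, eliminating the GRF-to-MRF MOV. For example (illustrative):
 *
 *    add vgrf4:F, vgrf1:F, vgrf2:F
 *    mov m3:F, vgrf4:F
 *
 * becomes
 *
 *    add m3:F, vgrf1:F, vgrf2:F
 *
 * provided vgrf4 is not read afterwards and no conflicting MRF write or read
 * of vgrf4 intervenes.
 */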
2453 bool
2454 fs_visitor::compute_to_mrf()
2455 {
2456 bool progress = false;
2457 int next_ip = 0;
2458
2459 calculate_live_intervals();
2460
2461 foreach_list_safe(node, &this->instructions) {
2462 fs_inst *inst = (fs_inst *)node;
2463
2464 int ip = next_ip;
2465 next_ip++;
2466
2467 if (inst->opcode != BRW_OPCODE_MOV ||
2468 inst->is_partial_write() ||
2469 inst->dst.file != MRF || inst->src[0].file != GRF ||
2470 inst->dst.type != inst->src[0].type ||
2471 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2472 continue;
2473
2474 /* Work out which hardware MRF registers are written by this
2475 * instruction.
2476 */
2477 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2478 int mrf_high;
2479 if (inst->dst.reg & BRW_MRF_COMPR4) {
2480 mrf_high = mrf_low + 4;
2481 } else if (dispatch_width == 16 &&
2482 (!inst->force_uncompressed && !inst->force_sechalf)) {
2483 mrf_high = mrf_low + 1;
2484 } else {
2485 mrf_high = mrf_low;
2486 }
2487
2488 /* Can't compute-to-MRF this GRF if someone else was going to
2489 * read it later.
2490 */
2491 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2492 continue;
2493
2494 /* Found a move of a GRF to a MRF. Let's see if we can go
2495 * rewrite the thing that made this GRF to write into the MRF.
2496 */
2497 fs_inst *scan_inst;
2498 for (scan_inst = (fs_inst *)inst->prev;
2499 scan_inst->prev != NULL;
2500 scan_inst = (fs_inst *)scan_inst->prev) {
2501 if (scan_inst->dst.file == GRF &&
2502 scan_inst->dst.reg == inst->src[0].reg) {
2503 /* Found the last thing to write our reg we want to turn
2504 * into a compute-to-MRF.
2505 */
2506
2507 /* If this one instruction didn't populate all the
2508 * channels, bail. We might be able to rewrite everything
2509 * that writes that reg, but it would require smarter
2510 * tracking to delay the rewriting until complete success.
2511 */
2512 if (scan_inst->is_partial_write())
2513 break;
2514
2515 /* Things returning more than one register would need us to
2516 * understand coalescing out more than one MOV at a time.
2517 */
2518 if (scan_inst->regs_written > 1)
2519 break;
2520
2521 /* SEND instructions can't have MRF as a destination. */
2522 if (scan_inst->mlen)
2523 break;
2524
2525 if (brw->gen == 6) {
2526 /* gen6 math instructions must have the destination be
2527 * GRF, so no compute-to-MRF for them.
2528 */
2529 if (scan_inst->is_math()) {
2530 break;
2531 }
2532 }
2533
2534 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2535 /* Found the creator of our MRF's source value. */
2536 scan_inst->dst.file = MRF;
2537 scan_inst->dst.reg = inst->dst.reg;
2538 scan_inst->saturate |= inst->saturate;
2539 inst->remove();
2540 progress = true;
2541 }
2542 break;
2543 }
2544
2545 /* We don't handle control flow here. Most computation of
2546 * values that end up in MRFs happens shortly before the MRF
2547 * write anyway.
2548 */
2549 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2550 break;
2551
2552 /* You can't read from an MRF, so if someone else reads our
2553 * MRF's source GRF that we wanted to rewrite, that stops us.
2554 */
2555 bool interfered = false;
2556 for (int i = 0; i < 3; i++) {
2557 if (scan_inst->src[i].file == GRF &&
2558 scan_inst->src[i].reg == inst->src[0].reg &&
2559 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2560 interfered = true;
2561 }
2562 }
2563 if (interfered)
2564 break;
2565
2566 if (scan_inst->dst.file == MRF) {
2567 /* If somebody else writes our MRF here, we can't
2568 * compute-to-MRF before that.
2569 */
2570 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2571 int scan_mrf_high;
2572
2573 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2574 scan_mrf_high = scan_mrf_low + 4;
2575 } else if (dispatch_width == 16 &&
2576 (!scan_inst->force_uncompressed &&
2577 !scan_inst->force_sechalf)) {
2578 scan_mrf_high = scan_mrf_low + 1;
2579 } else {
2580 scan_mrf_high = scan_mrf_low;
2581 }
2582
2583 if (mrf_low == scan_mrf_low ||
2584 mrf_low == scan_mrf_high ||
2585 mrf_high == scan_mrf_low ||
2586 mrf_high == scan_mrf_high) {
2587 break;
2588 }
2589 }
2590
2591 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2592 /* Found a SEND instruction, which means that there are
2593 * live values in MRFs from base_mrf to base_mrf +
2594 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2595 * above it.
2596 */
2597 if (mrf_low >= scan_inst->base_mrf &&
2598 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2599 break;
2600 }
2601 if (mrf_high >= scan_inst->base_mrf &&
2602 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2603 break;
2604 }
2605 }
2606 }
2607 }
2608
2609 if (progress)
2610 invalidate_live_intervals();
2611
2612 return progress;
2613 }
2614
2615 /**
2616 * Walks through basic blocks, looking for repeated MRF writes and
2617 * removing the later ones.
2618 */
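/* Illustrative example: with no intervening control flow and no write to
 * m3 or vgrf2 in between,
 *
 *    mov m3:F, vgrf2:F
 *    ...
 *    mov m3:F, vgrf2:F
 *
 * the second MOV is redundant and is removed.
 */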
2619 bool
2620 fs_visitor::remove_duplicate_mrf_writes()
2621 {
2622 fs_inst *last_mrf_move[16];
2623 bool progress = false;
2624
2625 /* We'd need to update the MRF tracking for compressed instructions; bail on SIMD16. */
2626 if (dispatch_width == 16)
2627 return false;
2628
2629 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2630
2631 foreach_list_safe(node, &this->instructions) {
2632 fs_inst *inst = (fs_inst *)node;
2633
2634 if (inst->is_control_flow()) {
2635 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2636 }
2637
2638 if (inst->opcode == BRW_OPCODE_MOV &&
2639 inst->dst.file == MRF) {
2640 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2641 if (prev_inst && inst->equals(prev_inst)) {
2642 inst->remove();
2643 progress = true;
2644 continue;
2645 }
2646 }
2647
2648 /* Clear out the last-write records for MRFs that were overwritten. */
2649 if (inst->dst.file == MRF) {
2650 last_mrf_move[inst->dst.reg] = NULL;
2651 }
2652
2653 if (inst->mlen > 0 && inst->base_mrf != -1) {
2654 /* Found a SEND instruction, which will include two or fewer
2655 * implied MRF writes. We could do better here.
2656 */
2657 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2658 last_mrf_move[inst->base_mrf + i] = NULL;
2659 }
2660 }
2661
2662 /* Clear out any MRF move records whose sources got overwritten. */
2663 if (inst->dst.file == GRF) {
2664 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2665 if (last_mrf_move[i] &&
2666 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2667 last_mrf_move[i] = NULL;
2668 }
2669 }
2670 }
2671
2672 if (inst->opcode == BRW_OPCODE_MOV &&
2673 inst->dst.file == MRF &&
2674 inst->src[0].file == GRF &&
2675 !inst->is_partial_write()) {
2676 last_mrf_move[inst->dst.reg] = inst;
2677 }
2678 }
2679
2680 if (progress)
2681 invalidate_live_intervals();
2682
2683 return progress;
2684 }
2685
2686 static void
2687 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2688 int first_grf, int grf_len)
2689 {
2690 bool inst_simd16 = (dispatch_width > 8 &&
2691 !inst->force_uncompressed &&
2692 !inst->force_sechalf);
2693
2694 /* Clear the flag for registers that actually got read (as expected). */
2695 for (int i = 0; i < 3; i++) {
2696 int grf;
2697 if (inst->src[i].file == GRF) {
2698 grf = inst->src[i].reg;
2699 } else if (inst->src[i].file == HW_REG &&
2700 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2701 grf = inst->src[i].fixed_hw_reg.nr;
2702 } else {
2703 continue;
2704 }
2705
2706 if (grf >= first_grf &&
2707 grf < first_grf + grf_len) {
2708 deps[grf - first_grf] = false;
2709 if (inst_simd16)
2710 deps[grf - first_grf + 1] = false;
2711 }
2712 }
2713 }
2714
2715 /**
2716 * Implements this workaround for the original 965:
2717 *
2718 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2719 * check for post destination dependencies on this instruction, software
2720 * must ensure that there is no destination hazard for the case of ‘write
2721 * followed by a posted write’ shown in the following example.
2722 *
2723 * 1. mov r3 0
2724 * 2. send r3.xy <rest of send instruction>
2725 * 3. mov r2 r3
2726 *
2727 * Due to no post-destination dependency check on the ‘send’, the above
2728 * code sequence could have two instructions (1 and 2) in flight at the
2729 * same time that both consider ‘r3’ as the target of their final writes.
2730 */
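/* Concretely (illustrative register numbers): for a SEND that writes g13-g14,
 * this walks backwards from the SEND; any of those registers written by an
 * earlier instruction and not read since gets a dependency-resolving MOV
 * (DEP_RESOLVE_MOV) for that register inserted immediately before the SEND,
 * forcing the older write to complete first.
 */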
2731 void
2732 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2733 {
2734 int reg_size = dispatch_width / 8;
2735 int write_len = inst->regs_written * reg_size;
2736 int first_write_grf = inst->dst.reg;
2737 bool needs_dep[BRW_MAX_MRF];
2738 assert(write_len < (int)sizeof(needs_dep) - 1);
2739
2740 memset(needs_dep, false, sizeof(needs_dep));
2741 memset(needs_dep, true, write_len);
2742
2743 clear_deps_for_inst_src(inst, dispatch_width,
2744 needs_dep, first_write_grf, write_len);
2745
2746 /* Walk backwards looking for writes to registers we're writing which
2747 * aren't read since being written. If we hit the start of the program,
2748 * we assume that there are no outstanding dependencies on entry to the
2749 * program.
2750 */
2751 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2752 scan_inst != NULL;
2753 scan_inst = (fs_inst *)scan_inst->prev) {
2754
2755 /* If we hit control flow, assume that there *are* outstanding
2756 * dependencies, and force their cleanup before our instruction.
2757 */
2758 if (scan_inst->is_control_flow()) {
2759 for (int i = 0; i < write_len; i++) {
2760 if (needs_dep[i]) {
2761 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2762 }
2763 }
2764 return;
2765 }
2766
2767 bool scan_inst_simd16 = (dispatch_width > 8 &&
2768 !scan_inst->force_uncompressed &&
2769 !scan_inst->force_sechalf);
2770
2771 /* We insert our reads as late as possible on the assumption that any
2772 * instruction but a MOV that might have left us an outstanding
2773 * dependency has more latency than a MOV.
2774 */
2775 if (scan_inst->dst.file == GRF) {
2776 for (int i = 0; i < scan_inst->regs_written; i++) {
2777 int reg = scan_inst->dst.reg + i * reg_size;
2778
2779 if (reg >= first_write_grf &&
2780 reg < first_write_grf + write_len &&
2781 needs_dep[reg - first_write_grf]) {
2782 inst->insert_before(DEP_RESOLVE_MOV(reg));
2783 needs_dep[reg - first_write_grf] = false;
2784 if (scan_inst_simd16)
2785 needs_dep[reg - first_write_grf + 1] = false;
2786 }
2787 }
2788 }
2789
2790 /* Clear the flag for registers that actually got read (as expected). */
2791 clear_deps_for_inst_src(scan_inst, dispatch_width,
2792 needs_dep, first_write_grf, write_len);
2793
2794 /* Continue the loop only if we haven't resolved all the dependencies */
2795 int i;
2796 for (i = 0; i < write_len; i++) {
2797 if (needs_dep[i])
2798 break;
2799 }
2800 if (i == write_len)
2801 return;
2802 }
2803 }
2804
2805 /**
2806 * Implements this workaround for the original 965:
2807 *
2808 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2809 * used as a destination register until after it has been sourced by an
2810 * instruction with a different destination register.
2811 */
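/* Concretely (illustrative): after
 *
 *    send g14, <message>
 *
 * if a later instruction would write g14 before anything has sourced it, a
 * dependency-resolving MOV (DEP_RESOLVE_MOV) for g14 is inserted just before
 * that write; control flow or the end of the program likewise forces any
 * remaining dependencies to be resolved.
 */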
2812 void
2813 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2814 {
2815 int write_len = inst->regs_written * dispatch_width / 8;
2816 int first_write_grf = inst->dst.reg;
2817 bool needs_dep[BRW_MAX_MRF];
2818 assert(write_len < (int)sizeof(needs_dep) - 1);
2819
2820 memset(needs_dep, false, sizeof(needs_dep));
2821 memset(needs_dep, true, write_len);
2822 /* Walk forwards looking for writes to registers we're writing which aren't
2823 * read before being written.
2824 */
2825 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2826 !scan_inst->is_tail_sentinel();
2827 scan_inst = (fs_inst *)scan_inst->next) {
2828 /* If we hit control flow, force resolve all remaining dependencies. */
2829 if (scan_inst->is_control_flow()) {
2830 for (int i = 0; i < write_len; i++) {
2831 if (needs_dep[i])
2832 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2833 }
2834 return;
2835 }
2836
2837 /* Clear the flag for registers that actually got read (as expected). */
2838 clear_deps_for_inst_src(scan_inst, dispatch_width,
2839 needs_dep, first_write_grf, write_len);
2840
2841 /* We insert our reads as late as possible since they're reading the
2842 * result of a SEND, which has massive latency.
2843 */
2844 if (scan_inst->dst.file == GRF &&
2845 scan_inst->dst.reg >= first_write_grf &&
2846 scan_inst->dst.reg < first_write_grf + write_len &&
2847 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2848 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2849 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2850 }
2851
2852 /* Continue the loop only if we haven't resolved all the dependencies */
2853 int i;
2854 for (i = 0; i < write_len; i++) {
2855 if (needs_dep[i])
2856 break;
2857 }
2858 if (i == write_len)
2859 return;
2860 }
2861
2862 /* If we hit the end of the program, resolve all remaining dependencies out
2863 * of paranoia.
2864 */
2865 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2866 assert(last_inst->eot);
2867 for (int i = 0; i < write_len; i++) {
2868 if (needs_dep[i])
2869 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2870 }
2871 }
2872
2873 void
2874 fs_visitor::insert_gen4_send_dependency_workarounds()
2875 {
2876 if (brw->gen != 4 || brw->is_g4x)
2877 return;
2878
2879 /* Note that we're done with register allocation, so GRF fs_regs always
2880 * have a .reg_offset of 0.
2881 */
2882
2883 foreach_list_safe(node, &this->instructions) {
2884 fs_inst *inst = (fs_inst *)node;
2885
2886 if (inst->mlen != 0 && inst->dst.file == GRF) {
2887 insert_gen4_pre_send_dependency_workarounds(inst);
2888 insert_gen4_post_send_dependency_workarounds(inst);
2889 }
2890 }
2891 }
2892
2893 /**
2894 * Turns the generic expression-style uniform pull constant load instruction
2895 * into a hardware-specific series of instructions for loading a pull
2896 * constant.
2897 *
2898 * The expression style allows the CSE pass before this to optimize out
2899 * repeated loads from the same offset, and gives the pre-register-allocation
2900 * scheduling full flexibility, while the conversion to native instructions
2901 * gives the post-register-allocation scheduler the best information
2902 * possible.
2903 *
2904 * Note that execution masking for setting up pull constant loads is special:
2905 * the channels that need to be written are unrelated to the current execution
2906 * mask, since a later instruction will use one of the result channels as a
2907 * source operand for all 8 or 16 of its channels.
2908 */
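/* Illustrative sketch of the gen7 path (register numbers invented): the
 * generic load
 *
 *    <uniform pull constant load> vgrf6:F, <surf_index>, 32u
 *
 * has its vec4-aligned byte offset converted to a dword offset (32 -> 8),
 * which FS_OPCODE_SET_SIMD4X2_OFFSET places in a payload register, and the
 * load itself becomes FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 sourcing
 * that payload. On gen4-6 the instruction instead just gets an MRF payload
 * (base_mrf 14, mlen 1).
 */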
2909 void
2910 fs_visitor::lower_uniform_pull_constant_loads()
2911 {
2912 foreach_list(node, &this->instructions) {
2913 fs_inst *inst = (fs_inst *)node;
2914
2915 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2916 continue;
2917
2918 if (brw->gen >= 7) {
2919 /* The offset arg before was a vec4-aligned byte offset. We need to
2920 * turn it into a dword offset.
2921 */
2922 fs_reg const_offset_reg = inst->src[1];
2923 assert(const_offset_reg.file == IMM &&
2924 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2925 const_offset_reg.imm.u /= 4;
2926 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2927
2928 /* This is actually going to be a MOV, but since only the first dword
2929 * is accessed, we have a special opcode to do just that one. Note
2930 * that this needs to be an operation that will be considered a def
2931 * by live variable analysis, or register allocation will explode.
2932 */
2933 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2934 payload, const_offset_reg);
2935 setup->force_writemask_all = true;
2936
2937 setup->ir = inst->ir;
2938 setup->annotation = inst->annotation;
2939 inst->insert_before(setup);
2940
2941 /* Similarly, this will only populate the first 4 channels of the
2942 * result register (since we only use smear values from 0-3), but we
2943 * don't tell the optimizer.
2944 */
2945 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2946 inst->src[1] = payload;
2947
2948 invalidate_live_intervals();
2949 } else {
2950 /* Before register allocation, we didn't tell the scheduler about the
2951 * MRF we use. We know it's safe to use this MRF because nothing
2952 * else does except for register spill/unspill, which generates and
2953 * uses its MRF within a single IR instruction.
2954 */
2955 inst->base_mrf = 14;
2956 inst->mlen = 1;
2957 }
2958 }
2959 }
2960
2961 void
2962 fs_visitor::dump_instructions()
2963 {
2964 calculate_register_pressure();
2965
2966 int ip = 0, max_pressure = 0;
2967 foreach_list(node, &this->instructions) {
2968 backend_instruction *inst = (backend_instruction *)node;
2969 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2970 printf("{%3d} %4d: ", regs_live_at_ip[ip], ip);
2971 dump_instruction(inst);
2972 ++ip;
2973 }
2974 printf("Maximum %3d registers live at once.\n", max_pressure);
2975 }
2976
2977 void
2978 fs_visitor::dump_instruction(backend_instruction *be_inst)
2979 {
2980 fs_inst *inst = (fs_inst *)be_inst;
2981
2982 if (inst->predicate) {
2983 printf("(%cf0.%d) ",
2984 inst->predicate_inverse ? '-' : '+',
2985 inst->flag_subreg);
2986 }
2987
2988 printf("%s", brw_instruction_name(inst->opcode));
2989 if (inst->saturate)
2990 printf(".sat");
2991 if (inst->conditional_mod) {
2992 printf("%s", conditional_modifier[inst->conditional_mod]);
2993 if (!inst->predicate &&
2994 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2995 inst->opcode != BRW_OPCODE_IF &&
2996 inst->opcode != BRW_OPCODE_WHILE))) {
2997 printf(".f0.%d", inst->flag_subreg);
2998 }
2999 }
3000 printf(" ");
3001
3002
3003 switch (inst->dst.file) {
3004 case GRF:
3005 printf("vgrf%d", inst->dst.reg);
3006 if (virtual_grf_sizes[inst->dst.reg] != 1)
3007 printf("+%d", inst->dst.reg_offset);
3008 break;
3009 case MRF:
3010 printf("m%d", inst->dst.reg);
3011 break;
3012 case BAD_FILE:
3013 printf("(null)");
3014 break;
3015 case UNIFORM:
3016 printf("***u%d***", inst->dst.reg);
3017 break;
3018 case HW_REG:
3019 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3020 switch (inst->dst.fixed_hw_reg.nr) {
3021 case BRW_ARF_NULL:
3022 printf("null");
3023 break;
3024 case BRW_ARF_ADDRESS:
3025 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
3026 break;
3027 case BRW_ARF_ACCUMULATOR:
3028 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
3029 break;
3030 case BRW_ARF_FLAG:
3031 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3032 inst->dst.fixed_hw_reg.subnr);
3033 break;
3034 default:
3035 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3036 inst->dst.fixed_hw_reg.subnr);
3037 break;
3038 }
3039 } else {
3040 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3041 }
3042 if (inst->dst.fixed_hw_reg.subnr)
3043 printf("+%d", inst->dst.fixed_hw_reg.subnr);
3044 break;
3045 default:
3046 printf("???");
3047 break;
3048 }
3049 printf(":%s, ", reg_encoding[inst->dst.type]);
3050
3051 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3052 if (inst->src[i].negate)
3053 printf("-");
3054 if (inst->src[i].abs)
3055 printf("|");
3056 switch (inst->src[i].file) {
3057 case GRF:
3058 printf("vgrf%d", inst->src[i].reg);
3059 if (virtual_grf_sizes[inst->src[i].reg] != 1)
3060 printf("+%d", inst->src[i].reg_offset);
3061 break;
3062 case MRF:
3063 printf("***m%d***", inst->src[i].reg);
3064 break;
3065 case UNIFORM:
3066 printf("u%d", inst->src[i].reg);
3067 if (virtual_grf_sizes[inst->src[i].reg] != 1)
3068 printf(".%d", inst->src[i].reg_offset);
3069 break;
3070 case BAD_FILE:
3071 printf("(null)");
3072 break;
3073 case IMM:
3074 switch (inst->src[i].type) {
3075 case BRW_REGISTER_TYPE_F:
3076 printf("%ff", inst->src[i].imm.f);
3077 break;
3078 case BRW_REGISTER_TYPE_D:
3079 printf("%dd", inst->src[i].imm.i);
3080 break;
3081 case BRW_REGISTER_TYPE_UD:
3082 printf("%uu", inst->src[i].imm.u);
3083 break;
3084 default:
3085 printf("???");
3086 break;
3087 }
3088 break;
3089 case HW_REG:
3090 if (inst->src[i].fixed_hw_reg.negate)
3091 printf("-");
3092 if (inst->src[i].fixed_hw_reg.abs)
3093 printf("|");
3094 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3095 switch (inst->src[i].fixed_hw_reg.nr) {
3096 case BRW_ARF_NULL:
3097 printf("null");
3098 break;
3099 case BRW_ARF_ADDRESS:
3100 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
3101 break;
3102 case BRW_ARF_ACCUMULATOR:
3103 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
3104 break;
3105 case BRW_ARF_FLAG:
3106 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3107 inst->src[i].fixed_hw_reg.subnr);
3108 break;
3109 default:
3110 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3111 inst->src[i].fixed_hw_reg.subnr);
3112 break;
3113 }
3114 } else {
3115 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3116 }
3117 if (inst->src[i].fixed_hw_reg.subnr)
3118 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3119 if (inst->src[i].fixed_hw_reg.abs)
3120 printf("|");
3121 break;
3122 default:
3123 printf("???");
3124 break;
3125 }
3126 if (inst->src[i].abs)
3127 printf("|");
3128
3129 if (inst->src[i].file != IMM) {
3130 printf(":%s", reg_encoding[inst->src[i].type]);
3131 }
3132
3133 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3134 printf(", ");
3135 }
3136
3137 printf(" ");
3138
3139 if (inst->force_uncompressed)
3140 printf("1sthalf ");
3141
3142 if (inst->force_sechalf)
3143 printf("2ndhalf ");
3144
3145 printf("\n");
3146 }
3147
3148 /**
3149 * Possibly returns an instruction that set up @param reg.
3150 *
3151 * Sometimes we want to take the result of some expression/variable
3152 * dereference tree and rewrite the instruction generating the result
3153 * of the tree. When processing the tree, we know that the
3154 * instructions generated are all writing temporaries that are dead
3155 * outside of this tree. So, if we have some instructions that write
3156 * a temporary, we're free to point that temp write somewhere else.
3157 *
3158 * Note that this doesn't guarantee that the returned instruction wrote
3159 * only reg -- it might be the size=4 destination of a texture instruction.
3160 */
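/* Hypothetical usage sketch: a caller that has just emitted the instructions
 * for an expression can do
 *
 *    fs_inst *modify = get_instruction_generating_reg(before, last, result);
 *    if (modify)
 *       modify->saturate = true;
 *
 * to fold a saturate into the producing instruction rather than emitting a
 * separate MOV.
 */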
3161 fs_inst *
3162 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3163 fs_inst *end,
3164 fs_reg reg)
3165 {
3166 if (end == start ||
3167 end->is_partial_write() ||
3168 reg.reladdr ||
3169 !reg.equals(end->dst)) {
3170 return NULL;
3171 } else {
3172 return end;
3173 }
3174 }
3175
3176 void
3177 fs_visitor::setup_payload_gen6()
3178 {
3179 bool uses_depth =
3180 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3181 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3182
3183 assert(brw->gen >= 6);
3184
3185 /* R0-1: masks, pixel X/Y coordinates. */
3186 c->nr_payload_regs = 2;
3187 /* R2: only for 32-pixel dispatch. */
3188
3189 /* R3-26: barycentric interpolation coordinates. These appear in the
3190 * same order that they appear in the brw_wm_barycentric_interp_mode
3191 * enum. Each set of coordinates occupies 2 registers if dispatch width
3192 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3193 * appear if they were enabled using the "Barycentric Interpolation
3194 * Mode" bits in WM_STATE.
3195 */
3196 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3197 if (barycentric_interp_modes & (1 << i)) {
3198 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3199 c->nr_payload_regs += 2;
3200 if (dispatch_width == 16) {
3201 c->nr_payload_regs += 2;
3202 }
3203 }
3204 }
3205
3206 /* R27: interpolated depth if uses source depth */
3207 if (uses_depth) {
3208 c->source_depth_reg = c->nr_payload_regs;
3209 c->nr_payload_regs++;
3210 if (dispatch_width == 16) {
3211 /* R28: interpolated depth if not SIMD8. */
3212 c->nr_payload_regs++;
3213 }
3214 }
3215 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3216 if (uses_depth) {
3217 c->source_w_reg = c->nr_payload_regs;
3218 c->nr_payload_regs++;
3219 if (dispatch_width == 16) {
3220 /* R30: interpolated W if not SIMD8. */
3221 c->nr_payload_regs++;
3222 }
3223 }
3224
3225 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3226 /* R31: MSAA position offsets. */
3227 if (c->prog_data.uses_pos_offset) {
3228 c->sample_pos_reg = c->nr_payload_regs;
3229 c->nr_payload_regs++;
3230 }
3231
3232 /* R32: MSAA input coverage mask */
3233 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3234 assert(brw->gen >= 7);
3235 c->sample_mask_reg = c->nr_payload_regs;
3236 c->nr_payload_regs++;
3237 if (dispatch_width == 16) {
3238 /* R33: input coverage mask if not SIMD8. */
3239 c->nr_payload_regs++;
3240 }
3241 }
3242
3243 /* R34-: bary for 32-pixel. */
3244 /* R58-59: interp W for 32-pixel. */
3245
3246 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3247 c->source_depth_to_render_target = true;
3248 }
3249 }
3250
3251 void
3252 fs_visitor::assign_binding_table_offsets()
3253 {
3254 uint32_t next_binding_table_offset = 0;
3255
3256 /* If there are no color regions, we still perform an FB write to a null
3257 * renderbuffer, which we place at surface index 0.
3258 */
3259 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3260 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3261
3262 assign_common_binding_table_offsets(next_binding_table_offset);
3263 }
3264
3265 void
3266 fs_visitor::calculate_register_pressure()
3267 {
3268 calculate_live_intervals();
3269
3270 int num_instructions = 0;
3271 foreach_list(node, &this->instructions) {
3272 ++num_instructions;
3273 }
3274
3275 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3276
3277 for (int reg = 0; reg < virtual_grf_count; reg++) {
3278 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3279 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3280 }
3281 }
3282
3283 bool
3284 fs_visitor::run()
3285 {
3286 sanity_param_count = fp->Base.Parameters->NumParameters;
3287 uint32_t orig_nr_params = c->prog_data.nr_params;
3288 bool allocated_without_spills;
3289
3290 assign_binding_table_offsets();
3291
3292 if (brw->gen >= 6)
3293 setup_payload_gen6();
3294 else
3295 setup_payload_gen4();
3296
3297 if (0) {
3298 emit_dummy_fs();
3299 } else {
3300 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3301 emit_shader_time_begin();
3302
3303 calculate_urb_setup();
3304 if (fp->Base.InputsRead > 0) {
3305 if (brw->gen < 6)
3306 emit_interpolation_setup_gen4();
3307 else
3308 emit_interpolation_setup_gen6();
3309 }
3310
3311 /* We handle discards by keeping track of the still-live pixels in f0.1.
3312 * Initialize it with the dispatched pixels.
3313 */
3314 if (fp->UsesKill || c->key.alpha_test_func) {
3315 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3316 discard_init->flag_subreg = 1;
3317 }
3318
3319 /* Generate FS IR for main(). (the visitor only descends into
3320 * functions called "main").
3321 */
3322 if (shader) {
3323 foreach_list(node, &*shader->base.ir) {
3324 ir_instruction *ir = (ir_instruction *)node;
3325 base_ir = ir;
3326 this->result = reg_undef;
3327 ir->accept(this);
3328 }
3329 } else {
3330 emit_fragment_program_code();
3331 }
3332 base_ir = NULL;
3333 if (failed)
3334 return false;
3335
3336 emit(FS_OPCODE_PLACEHOLDER_HALT);
3337
3338 if (c->key.alpha_test_func)
3339 emit_alpha_test();
3340
3341 emit_fb_writes();
3342
3343 split_virtual_grfs();
3344
3345 move_uniform_array_access_to_pull_constants();
3346 remove_dead_constants();
3347 setup_pull_constants();
3348
3349 bool progress;
3350 do {
3351 progress = false;
3352
3353 compact_virtual_grfs();
3354
3355 progress = remove_duplicate_mrf_writes() || progress;
3356
3357 progress = opt_algebraic() || progress;
3358 progress = opt_cse() || progress;
3359 progress = opt_copy_propagate() || progress;
3360 progress = opt_peephole_predicated_break() || progress;
3361 progress = dead_code_eliminate() || progress;
3362 progress = dead_code_eliminate_local() || progress;
3363 progress = opt_peephole_sel() || progress;
3364 progress = dead_control_flow_eliminate(this) || progress;
3365 progress = register_coalesce() || progress;
3366 progress = compute_to_mrf() || progress;
3367 } while (progress);
3368
3369 lower_uniform_pull_constant_loads();
3370
3371 assign_curb_setup();
3372 assign_urb_setup();
3373
3374 static enum instruction_scheduler_mode pre_modes[] = {
3375 SCHEDULE_PRE,
3376 SCHEDULE_PRE_NON_LIFO,
3377 SCHEDULE_PRE_LIFO,
3378 };
3379
3380 /* Try each scheduling heuristic to see if it can successfully register
3381 * allocate without spilling. They should be ordered by decreasing
3382 * performance but increasing likelihood of allocating.
3383 */
3384 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3385 schedule_instructions(pre_modes[i]);
3386
3387 if (0) {
3388 assign_regs_trivial();
3389 allocated_without_spills = true;
3390 } else {
3391 allocated_without_spills = assign_regs(false);
3392 }
3393 if (allocated_without_spills)
3394 break;
3395 }
3396
3397 if (!allocated_without_spills) {
3398 /* We assume that any spilling is worse than just dropping back to
3399 * SIMD8. There's probably actually some intermediate point where
3400 * SIMD16 with a couple of spills is still better.
3401 */
3402 if (dispatch_width == 16) {
3403 fail("Failure to register allocate. Reduce number of "
3404 "live scalar values to avoid this.");
3405 }
3406
3407 /* Since we're out of heuristics, just go spill registers until we
3408 * get an allocation.
3409 */
3410 while (!assign_regs(true)) {
3411 if (failed)
3412 break;
3413 }
3414 }
3415 }
3416 assert(force_uncompressed_stack == 0);
3417
3418 /* This must come after all optimization and register allocation, since
3419 * it inserts dead code that happens to have side effects, and it does
3420 * so based on the actual physical registers in use.
3421 */
3422 insert_gen4_send_dependency_workarounds();
3423
3424 if (failed)
3425 return false;
3426
3427 if (!allocated_without_spills)
3428 schedule_instructions(SCHEDULE_POST);
3429
3430 if (dispatch_width == 8) {
3431 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3432 } else {
3433 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3434
3435 /* Make sure we didn't try to sneak in an extra uniform */
3436 assert(orig_nr_params == c->prog_data.nr_params);
3437 (void) orig_nr_params;
3438 }
3439
3440 /* If any state parameters were appended, then ParameterValues could have
3441 * been realloced, in which case the driver uniform storage set up by
3442 * _mesa_associate_uniform_storage() would point to freed memory. Make
3443 * sure that didn't happen.
3444 */
3445 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3446
3447 return !failed;
3448 }
3449
3450 const unsigned *
3451 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3452 struct gl_fragment_program *fp,
3453 struct gl_shader_program *prog,
3454 unsigned *final_assembly_size)
3455 {
3456 bool start_busy = false;
3457 float start_time = 0;
3458
3459 if (unlikely(brw->perf_debug)) {
3460 start_busy = (brw->batch.last_bo &&
3461 drm_intel_bo_busy(brw->batch.last_bo));
3462 start_time = get_time();
3463 }
3464
3465 struct brw_shader *shader = NULL;
3466 if (prog)
3467 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3468
3469 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3470 if (prog) {
3471 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3472 _mesa_print_ir(shader->base.ir, NULL);
3473 printf("\n\n");
3474 } else {
3475 printf("ARB_fragment_program %d ir for native fragment shader\n",
3476 fp->Base.Id);
3477 _mesa_print_program(&fp->Base);
3478 }
3479 }
3480
3481 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3482 */
3483 fs_visitor v(brw, c, prog, fp, 8);
3484 if (!v.run()) {
3485 if (prog) {
3486 prog->LinkStatus = false;
3487 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3488 }
3489
3490 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3491 v.fail_msg);
3492
3493 return NULL;
3494 }
3495
3496 exec_list *simd16_instructions = NULL;
3497 fs_visitor v2(brw, c, prog, fp, 16);
3498 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3499 if (c->prog_data.nr_pull_params == 0) {
3500 /* Try a SIMD16 compile */
3501 v2.import_uniforms(&v);
3502 if (!v2.run()) {
3503 perf_debug("SIMD16 shader failed to compile, falling back to "
3504 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3505 } else {
3506 simd16_instructions = &v2.instructions;
3507 }
3508 } else {
3509 perf_debug("Skipping SIMD16 due to pull parameters.\n");
3510 }
3511 }
3512
3513 const unsigned *assembly = NULL;
3514 if (brw->gen >= 8) {
3515 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3516 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3517 final_assembly_size);
3518 } else {
3519 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3520 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3521 final_assembly_size);
3522 }
3523
3524 if (unlikely(brw->perf_debug) && shader) {
3525 if (shader->compiled_once)
3526 brw_wm_debug_recompile(brw, prog, &c->key);
3527 shader->compiled_once = true;
3528
3529 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3530 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3531 (get_time() - start_time) * 1000);
3532 }
3533 }
3534
3535 return assembly;
3536 }
3537
3538 bool
3539 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3540 {
3541 struct brw_context *brw = brw_context(ctx);
3542 struct brw_wm_prog_key key;
3543
3544 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3545 return true;
3546
3547 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3548 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3549 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3550 bool program_uses_dfdy = fp->UsesDFdy;
3551
3552 memset(&key, 0, sizeof(key));
3553
3554 if (brw->gen < 6) {
3555 if (fp->UsesKill)
3556 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3557
3558 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3559 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3560
3561 /* Just assume depth testing. */
3562 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3563 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3564 }
3565
3566 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3567 BRW_FS_VARYING_INPUT_MASK) > 16)
3568 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3569
3570 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3571
3572 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3573 for (unsigned i = 0; i < sampler_count; i++) {
3574 if (fp->Base.ShadowSamplers & (1 << i)) {
3575 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3576 key.tex.swizzles[i] =
3577 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3578 } else {
3579 /* Color sampler: assume no swizzling. */
3580 key.tex.swizzles[i] = SWIZZLE_XYZW;
3581 }
3582 }
3583
3584 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3585 key.drawable_height = ctx->DrawBuffer->Height;
3586 }
3587
3588 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3589 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3590 }
3591
3592 key.nr_color_regions = 1;
3593
3594 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3595 * quality of the derivatives is likely to be determined by the driconf
3596 * option.
3597 */
3598 key.high_quality_derivatives = brw->disable_derivative_optimization;
3599
3600 key.program_string_id = bfp->id;
3601
3602 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3603 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3604
3605 bool success = do_wm_prog(brw, prog, bfp, &key);
3606
3607 brw->wm.base.prog_offset = old_prog_offset;
3608 brw->wm.prog_data = old_prog_data;
3609
3610 return success;
3611 }