src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/hash_table.h"
  36 #include "main/macros.h"
  37 #include "main/shaderobj.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "brw_dead_control_flow.h"
  50 #include "main/uniforms.h"
  51 #include "brw_fs_live_variables.h"
  52 #include "glsl/glsl_types.h"
  53
  54 void
  55 fs_inst::init()
  56 {
  57    memset(this, 0, sizeof(*this));
  58    this->opcode = BRW_OPCODE_NOP;
  59    this->conditional_mod = BRW_CONDITIONAL_NONE;
  60
  61    this->dst = reg_undef;
  62    this->src[0] = reg_undef;
  63    this->src[1] = reg_undef;
  64    this->src[2] = reg_undef;
  65
  66    /* This will be the case for almost all instructions. */
  67    this->regs_written = 1;
  68 }
  69
  70 fs_inst::fs_inst()
  71 {
  72    init();
  73 }
  74
  75 fs_inst::fs_inst(enum opcode opcode)
  76 {
  77    init();
  78    this->opcode = opcode;
  79 }
  80
  81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  82 {
  83    init();
  84    this->opcode = opcode;
  85    this->dst = dst;
  86
  87    if (dst.file == GRF)
  88       assert(dst.reg_offset >= 0);
  89 }
  90
  91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  92 {
  93    init();
  94    this->opcode = opcode;
  95    this->dst = dst;
  96    this->src[0] = src0;
  97
  98    if (dst.file == GRF)
  99       assert(dst.reg_offset >= 0);
 100    if (src[0].file == GRF)
 101       assert(src[0].reg_offset >= 0);
 102 }
 103
 104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 105 {
 106    init();
 107    this->opcode = opcode;
 108    this->dst = dst;
 109    this->src[0] = src0;
 110    this->src[1] = src1;
 111
 112    if (dst.file == GRF)
 113       assert(dst.reg_offset >= 0);
 114    if (src[0].file == GRF)
 115       assert(src[0].reg_offset >= 0);
 116    if (src[1].file == GRF)
 117       assert(src[1].reg_offset >= 0);
 118 }
 119
 120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 121                  fs_reg src0, fs_reg src1, fs_reg src2)
 122 {
 123    init();
 124    this->opcode = opcode;
 125    this->dst = dst;
 126    this->src[0] = src0;
 127    this->src[1] = src1;
 128    this->src[2] = src2;
 129
 130    if (dst.file == GRF)
 131       assert(dst.reg_offset >= 0);
 132    if (src[0].file == GRF)
 133       assert(src[0].reg_offset >= 0);
 134    if (src[1].file == GRF)
 135       assert(src[1].reg_offset >= 0);
 136    if (src[2].file == GRF)
 137       assert(src[2].reg_offset >= 0);
 138 }
 139
 140 #define ALU1(op)                                                        \
 141    fs_inst *                                                            \
 142    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 143    {                                                                    \
 144       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 145    }
 146
 147 #define ALU2(op)                                                        \
 148    fs_inst *                                                            \
 149    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 150    {                                                                    \
 151       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 152    }
 153
 154 #define ALU3(op)                                                        \
 155    fs_inst *                                                            \
 156    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
 157    {                                                                    \
 158       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 159    }
 160
 161 ALU1(NOT)
 162 ALU1(MOV)
 163 ALU1(FRC)
 164 ALU1(RNDD)
 165 ALU1(RNDE)
 166 ALU1(RNDZ)
 167 ALU2(ADD)
 168 ALU2(MUL)
 169 ALU2(MACH)
 170 ALU2(AND)
 171 ALU2(OR)
 172 ALU2(XOR)
 173 ALU2(SHL)
 174 ALU2(SHR)
 175 ALU2(ASR)
 176 ALU3(LRP)
 177 ALU1(BFREV)
 178 ALU3(BFE)
 179 ALU2(BFI1)
 180 ALU3(BFI2)
 181 ALU1(FBH)
 182 ALU1(FBL)
 183 ALU1(CBIT)
 184 ALU3(MAD)
 185 ALU2(ADDC)
 186 ALU2(SUBB)
 187 ALU2(SEL)
 188
 189 /** Gen4 predicated IF. */
 190 fs_inst *
 191 fs_visitor::IF(uint32_t predicate)
 192 {
 193    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 194    inst->predicate = predicate;
 195    return inst;
 196 }
 197
 198 /** Gen6 IF with embedded comparison. */
 199 fs_inst *
 200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 201 {
 202    assert(brw->gen == 6);
 203    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 204                                         reg_null_d, src0, src1);
 205    inst->conditional_mod = condition;
 206    return inst;
 207 }
 208
 209 /**
 210  * CMP: Sets the low bit of the destination channels with the result
 211  * of the comparison, while the upper bits are undefined, and updates
 212  * the flag register with the packed 16 bits of the result.
 213  */
 214 fs_inst *
 215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 216 {
 217    fs_inst *inst;
 218
 219    /* Take the instruction:
 220     *
 221     * CMP null<d> src0<f> src1<f>
 222     *
 223     * Original gen4 does type conversion to the destination type before
 224     * comparison, producing garbage results for floating point comparisons.
 225     * gen5 does the comparison on the execution type (resolved source types),
 226     * so dst type doesn't matter.  gen6 does comparison and then uses the
 227     * result as if it was the dst type with no conversion, which happens to
 228     * mostly work out for float-interpreted-as-int since our comparisons are
 229     * for >0, =0, <0.
 230     */
 231    if (brw->gen == 4) {
 232       dst.type = src0.type;
 233       if (dst.file == HW_REG)
 234          dst.fixed_hw_reg.type = dst.type;
 235    }
 236
 237    resolve_ud_negate(&src0);
 238    resolve_ud_negate(&src1);
 239
 240    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 241    inst->conditional_mod = condition;
 242
 243    return inst;
 244 }
 245
 246 exec_list
 247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
 248                                        fs_reg varying_offset,
 249                                        uint32_t const_offset)
 250 {
 251    exec_list instructions;
 252    fs_inst *inst;
 253
 254    /* We have our constant surface use a pitch of 4 bytes, so our index can
 255     * be any component of a vector, and then we load 4 contiguous
 256     * components starting from that.
 257     *
 258     * We break down the const_offset to a portion added to the variable
 259     * offset and a portion done using reg_offset, which means that if you
 260     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 261     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 262     * CSE can later notice that those loads are all the same and eliminate
 263     * the redundant ones.
 264     */
 265    fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
 266    instructions.push_tail(ADD(vec4_offset,
 267                               varying_offset, const_offset & ~3));
 268
 269    int scale = 1;
 270    if (brw->gen == 4 && dispatch_width == 8) {
 271       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 272        * u, v, r) as parameters, or we can just use the SIMD16 message
 273        * consisting of (header, u).  We choose the second, at the cost of a
 274        * longer return length.
 275        */
 276       scale = 2;
 277    }
 278
 279    enum opcode op;
 280    if (brw->gen >= 7)
 281       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 282    else
 283       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 284    fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
 285    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 286    inst->regs_written = 4 * scale;
 287    instructions.push_tail(inst);
 288
 289    if (brw->gen < 7) {
 290       inst->base_mrf = 13;
 291       inst->header_present = true;
 292       if (brw->gen == 4)
 293          inst->mlen = 3;
 294       else
 295          inst->mlen = 1 + dispatch_width / 8;
 296    }
 297
 298    vec4_result.reg_offset += (const_offset & 3) * scale;
 299    instructions.push_tail(MOV(dst, vec4_result));
 300
 301    return instructions;
 302 }
 303
 304 /**
 305  * A helper for MOV generation for fixing up broken hardware SEND dependency
 306  * handling.
 307  */
 308 fs_inst *
 309 fs_visitor::DEP_RESOLVE_MOV(int grf)
 310 {
 311    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 312
 313    inst->ir = NULL;
 314    inst->annotation = "send dependency resolve";
 315
 316    /* The caller always wants uncompressed to emit the minimal extra
 317     * dependencies, and to avoid having to deal with aligning its regs to 2.
 318     */
 319    inst->force_uncompressed = true;
 320
 321    return inst;
 322 }
 323
 324 bool
 325 fs_inst::equals(fs_inst *inst)
 326 {
 327    return (opcode == inst->opcode &&
 328            dst.equals(inst->dst) &&
 329            src[0].equals(inst->src[0]) &&
 330            src[1].equals(inst->src[1]) &&
 331            src[2].equals(inst->src[2]) &&
 332            saturate == inst->saturate &&
 333            predicate == inst->predicate &&
 334            conditional_mod == inst->conditional_mod &&
 335            mlen == inst->mlen &&
 336            base_mrf == inst->base_mrf &&
 337            sampler == inst->sampler &&
 338            target == inst->target &&
 339            eot == inst->eot &&
 340            header_present == inst->header_present &&
 341            shadow_compare == inst->shadow_compare &&
 342            offset == inst->offset);
 343 }
 344
 345 bool
 346 fs_inst::overwrites_reg(const fs_reg &reg)
 347 {
 348    return (reg.file == dst.file &&
 349            reg.reg == dst.reg &&
 350            reg.reg_offset >= dst.reg_offset  &&
 351            reg.reg_offset < dst.reg_offset + regs_written);
 352 }
 353
 354 bool
 355 fs_inst::is_send_from_grf()
 356 {
 357    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 358            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
 359            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 360             src[1].file == GRF) ||
 361            (is_tex() && src[0].file == GRF));
 362 }
 363
 364 bool
 365 fs_visitor::can_do_source_mods(fs_inst *inst)
 366 {
 367    if (brw->gen == 6 && inst->is_math())
 368       return false;
 369
 370    if (inst->is_send_from_grf())
 371       return false;
 372
 373    if (!inst->can_do_source_mods())
 374       return false;
 375
 376    return true;
 377 }
 378
 379 void
 380 fs_reg::init()
 381 {
 382    memset(this, 0, sizeof(*this));
 383    this->smear = -1;
 384 }
 385
 386 /** Generic unset register constructor. */
 387 fs_reg::fs_reg()
 388 {
 389    init();
 390    this->file = BAD_FILE;
 391 }
 392
 393 /** Immediate value constructor. */
 394 fs_reg::fs_reg(float f)
 395 {
 396    init();
 397    this->file = IMM;
 398    this->type = BRW_REGISTER_TYPE_F;
 399    this->imm.f = f;
 400 }
 401
 402 /** Immediate value constructor. */
 403 fs_reg::fs_reg(int32_t i)
 404 {
 405    init();
 406    this->file = IMM;
 407    this->type = BRW_REGISTER_TYPE_D;
 408    this->imm.i = i;
 409 }
 410
 411 /** Immediate value constructor. */
 412 fs_reg::fs_reg(uint32_t u)
 413 {
 414    init();
 415    this->file = IMM;
 416    this->type = BRW_REGISTER_TYPE_UD;
 417    this->imm.u = u;
 418 }
 419
 420 /** Fixed brw_reg. */
 421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 422 {
 423    init();
 424    this->file = HW_REG;
 425    this->fixed_hw_reg = fixed_hw_reg;
 426    this->type = fixed_hw_reg.type;
 427 }
 428
 429 bool
 430 fs_reg::equals(const fs_reg &r) const
 431 {
 432    return (file == r.file &&
 433            reg == r.reg &&
 434            reg_offset == r.reg_offset &&
 435            type == r.type &&
 436            negate == r.negate &&
 437            abs == r.abs &&
 438            !reladdr && !r.reladdr &&
 439            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 440                   sizeof(fixed_hw_reg)) == 0 &&
 441            smear == r.smear &&
 442            imm.u == r.imm.u);
 443 }
 444
 445 fs_reg
 446 fs_reg::retype(uint32_t type)
 447 {
 448    fs_reg result = *this;
 449    result.type = type;
 450    return result;
 451 }
 452
 453 bool
 454 fs_reg::is_zero() const
 455 {
 456    if (file != IMM)
 457       return false;
 458
 459    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 460 }
 461
 462 bool
 463 fs_reg::is_one() const
 464 {
 465    if (file != IMM)
 466       return false;
 467
 468    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 469 }
 470
 471 bool
 472 fs_reg::is_null() const
 473 {
 474    return file == HW_REG &&
 475           fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 476           fixed_hw_reg.nr == BRW_ARF_NULL;
 477 }
 478
 479 bool
 480 fs_reg::is_valid_3src() const
 481 {
 482    return file == GRF || file == UNIFORM;
 483 }
 484
 485 int
 486 fs_visitor::type_size(const struct glsl_type *type)
 487 {
 488    unsigned int size, i;
 489
 490    switch (type->base_type) {
 491    case GLSL_TYPE_UINT:
 492    case GLSL_TYPE_INT:
 493    case GLSL_TYPE_FLOAT:
 494    case GLSL_TYPE_BOOL:
 495       return type->components();
 496    case GLSL_TYPE_ARRAY:
 497       return type_size(type->fields.array) * type->length;
 498    case GLSL_TYPE_STRUCT:
 499       size = 0;
 500       for (i = 0; i < type->length; i++) {
 501          size += type_size(type->fields.structure[i].type);
 502       }
 503       return size;
 504    case GLSL_TYPE_SAMPLER:
 505       /* Samplers take up no register space, since they're baked in at
 506        * link time.
 507        */
 508       return 0;
 509    case GLSL_TYPE_ATOMIC_UINT:
 510       return 0;
 511    case GLSL_TYPE_IMAGE:
 512    case GLSL_TYPE_VOID:
 513    case GLSL_TYPE_ERROR:
 514    case GLSL_TYPE_INTERFACE:
 515       assert(!"not reached");
 516       break;
 517    }
 518
 519    return 0;
 520 }
 521
 522 fs_reg
 523 fs_visitor::get_timestamp()
 524 {
 525    assert(brw->gen >= 7);
 526
 527    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 528                                           BRW_ARF_TIMESTAMP,
 529                                           0),
 530                              BRW_REGISTER_TYPE_UD));
 531
 532    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 533
 534    fs_inst *mov = emit(MOV(dst, ts));
 535    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 536     * even if it's not enabled in the dispatch.
 537     */
 538    mov->force_writemask_all = true;
 539    mov->force_uncompressed = true;
 540
 541    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 542     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 543     * which is plenty of time for our purposes.  It is identical across the
 544     * EUs, but since it's tracking GPU core speed it will increment at a
 545     * varying rate as render P-states change.
 546     *
 547     * The caller could also check if render P-states have changed (or anything
 548     * else that might disrupt timing) by setting smear to 2 and checking if
 549     * that field is != 0.
 550     */
 551    dst.smear = 0;
 552
 553    return dst;
 554 }
 555
 556 void
 557 fs_visitor::emit_shader_time_begin()
 558 {
 559    current_annotation = "shader time start";
 560    shader_start_time = get_timestamp();
 561 }
 562
 563 void
 564 fs_visitor::emit_shader_time_end()
 565 {
 566    current_annotation = "shader time end";
 567
 568    enum shader_time_shader_type type, written_type, reset_type;
 569    if (dispatch_width == 8) {
 570       type = ST_FS8;
 571       written_type = ST_FS8_WRITTEN;
 572       reset_type = ST_FS8_RESET;
 573    } else {
 574       assert(dispatch_width == 16);
 575       type = ST_FS16;
 576       written_type = ST_FS16_WRITTEN;
 577       reset_type = ST_FS16_RESET;
 578    }
 579
 580    fs_reg shader_end_time = get_timestamp();
 581
 582    /* Check that there weren't any timestamp reset events (assuming these
 583     * were the only two timestamp reads that happened).
 584     */
 585    fs_reg reset = shader_end_time;
 586    reset.smear = 2;
 587    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 588    test->conditional_mod = BRW_CONDITIONAL_Z;
 589    emit(IF(BRW_PREDICATE_NORMAL));
 590
 591    push_force_uncompressed();
 592    fs_reg start = shader_start_time;
 593    start.negate = true;
 594    fs_reg diff = fs_reg(this, glsl_type::uint_type);
 595    emit(ADD(diff, start, shader_end_time));
 596
 597    /* If there were no instructions between the two timestamp gets, the diff
 598     * is 2 cycles.  Remove that overhead, so I can forget about that when
 599     * trying to determine the time taken for single instructions.
 600     */
 601    emit(ADD(diff, diff, fs_reg(-2u)));
 602
 603    emit_shader_time_write(type, diff);
 604    emit_shader_time_write(written_type, fs_reg(1u));
 605    emit(BRW_OPCODE_ELSE);
 606    emit_shader_time_write(reset_type, fs_reg(1u));
 607    emit(BRW_OPCODE_ENDIF);
 608
 609    pop_force_uncompressed();
 610 }
 611
 612 void
 613 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 614                                    fs_reg value)
 615 {
 616    int shader_time_index =
 617       brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
 618    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 619
 620    fs_reg payload;
 621    if (dispatch_width == 8)
 622       payload = fs_reg(this, glsl_type::uvec2_type);
 623    else
 624       payload = fs_reg(this, glsl_type::uint_type);
 625
 626    emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 627                 fs_reg(), payload, offset, value));
 628 }
 629
 630 void
 631 fs_visitor::fail(const char *format, ...)
 632 {
 633    va_list va;
 634    char *msg;
 635
 636    if (failed)
 637       return;
 638
 639    failed = true;
 640
 641    va_start(va, format);
 642    msg = ralloc_vasprintf(mem_ctx, format, va);
 643    va_end(va);
 644    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 645
 646    this->fail_msg = msg;
 647
 648    if (INTEL_DEBUG & DEBUG_WM) {
 649       fprintf(stderr, "%s",  msg);
 650    }
 651 }
 652
 653 fs_inst *
 654 fs_visitor::emit(enum opcode opcode)
 655 {
 656    return emit(fs_inst(opcode));
 657 }
 658
 659 fs_inst *
 660 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 661 {
 662    return emit(fs_inst(opcode, dst));
 663 }
 664
 665 fs_inst *
 666 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 667 {
 668    return emit(fs_inst(opcode, dst, src0));
 669 }
 670
 671 fs_inst *
 672 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 673 {
 674    return emit(fs_inst(opcode, dst, src0, src1));
 675 }
 676
 677 fs_inst *
 678 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 679                  fs_reg src0, fs_reg src1, fs_reg src2)
 680 {
 681    return emit(fs_inst(opcode, dst, src0, src1, src2));
 682 }
 683
 684 void
 685 fs_visitor::push_force_uncompressed()
 686 {
 687    force_uncompressed_stack++;
 688 }
 689
 690 void
 691 fs_visitor::pop_force_uncompressed()
 692 {
 693    force_uncompressed_stack--;
 694    assert(force_uncompressed_stack >= 0);
 695 }
 696
 697 /**
 698  * Returns true if the instruction has a flag that means it won't
 699  * update an entire destination register.
 700  *
 701  * For example, dead code elimination and live variable analysis want to know
 702  * when a write to a variable screens off any preceding values that were in
 703  * it.
 704  */
 705 bool
 706 fs_inst::is_partial_write()
 707 {
 708    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 709            this->force_uncompressed ||
 710            this->force_sechalf);
 711 }
 712
 713 int
 714 fs_inst::regs_read(fs_visitor *v, int arg)
 715 {
 716    if (is_tex() && arg == 0 && src[0].file == GRF) {
 717       if (v->dispatch_width == 16)
 718          return (mlen + 1) / 2;
 719       else
 720          return mlen;
 721    }
 722    return 1;
 723 }
 724
 725 bool
 726 fs_inst::reads_flag()
 727 {
 728    return predicate;
 729 }
 730
 731 bool
 732 fs_inst::writes_flag()
 733 {
 734    return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
 735           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 736 }
 737
 738 /**
 739  * Returns how many MRFs an FS opcode will write over.
 740  *
 741  * Note that this is not the 0 or 1 implied writes in an actual gen
 742  * instruction -- the FS opcodes often generate MOVs in addition.
 743  */
 744 int
 745 fs_visitor::implied_mrf_writes(fs_inst *inst)
 746 {
 747    if (inst->mlen == 0)
 748       return 0;
 749
 750    if (inst->base_mrf == -1)
 751       return 0;
 752
 753    switch (inst->opcode) {
 754    case SHADER_OPCODE_RCP:
 755    case SHADER_OPCODE_RSQ:
 756    case SHADER_OPCODE_SQRT:
 757    case SHADER_OPCODE_EXP2:
 758    case SHADER_OPCODE_LOG2:
 759    case SHADER_OPCODE_SIN:
 760    case SHADER_OPCODE_COS:
 761       return 1 * dispatch_width / 8;
 762    case SHADER_OPCODE_POW:
 763    case SHADER_OPCODE_INT_QUOTIENT:
 764    case SHADER_OPCODE_INT_REMAINDER:
 765       return 2 * dispatch_width / 8;
 766    case SHADER_OPCODE_TEX:
 767    case FS_OPCODE_TXB:
 768    case SHADER_OPCODE_TXD:
 769    case SHADER_OPCODE_TXF:
 770    case SHADER_OPCODE_TXF_CMS:
 771    case SHADER_OPCODE_TXF_MCS:
 772    case SHADER_OPCODE_TG4:
 773    case SHADER_OPCODE_TG4_OFFSET:
 774    case SHADER_OPCODE_TXL:
 775    case SHADER_OPCODE_TXS:
 776    case SHADER_OPCODE_LOD:
 777       return 1;
 778    case FS_OPCODE_FB_WRITE:
 779       return 2;
 780    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 781    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 782       return 1;
 783    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 784       return inst->mlen;
 785    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 786       return 2;
 787    case SHADER_OPCODE_UNTYPED_ATOMIC:
 788    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 789       return 0;
 790    default:
 791       assert(!"not reached");
 792       return inst->mlen;
 793    }
 794 }
 795
 796 int
 797 fs_visitor::virtual_grf_alloc(int size)
 798 {
 799    if (virtual_grf_array_size <= virtual_grf_count) {
 800       if (virtual_grf_array_size == 0)
 801          virtual_grf_array_size = 16;
 802       else
 803          virtual_grf_array_size *= 2;
 804       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 805                                    virtual_grf_array_size);
 806    }
 807    virtual_grf_sizes[virtual_grf_count] = size;
 808    return virtual_grf_count++;
 809 }
 810
 811 /** Fixed HW reg constructor. */
 812 fs_reg::fs_reg(enum register_file file, int reg)
 813 {
 814    init();
 815    this->file = file;
 816    this->reg = reg;
 817    this->type = BRW_REGISTER_TYPE_F;
 818 }
 819
 820 /** Fixed HW reg constructor. */
 821 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 822 {
 823    init();
 824    this->file = file;
 825    this->reg = reg;
 826    this->type = type;
 827 }
 828
 829 /** Automatic reg constructor. */
 830 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 831 {
 832    init();
 833
 834    this->file = GRF;
 835    this->reg = v->virtual_grf_alloc(v->type_size(type));
 836    this->reg_offset = 0;
 837    this->type = brw_type_for_base_type(type);
 838 }
 839
 840 fs_reg *
 841 fs_visitor::variable_storage(ir_variable *var)
 842 {
 843    return (fs_reg *)hash_table_find(this->variable_ht, var);
 844 }
 845
 846 void
 847 import_uniforms_callback(const void *key,
 848                          void *data,
 849                          void *closure)
 850 {
 851    struct hash_table *dst_ht = (struct hash_table *)closure;
 852    const fs_reg *reg = (const fs_reg *)data;
 853
 854    if (reg->file != UNIFORM)
 855       return;
 856
 857    hash_table_insert(dst_ht, data, key);
 858 }
 859
 860 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 861  * This brings in those uniform definitions
 862  */
 863 void
 864 fs_visitor::import_uniforms(fs_visitor *v)
 865 {
 866    hash_table_call_foreach(v->variable_ht,
 867                            import_uniforms_callback,
 868                            variable_ht);
 869    this->params_remap = v->params_remap;
 870    this->nr_params_remap = v->nr_params_remap;
 871 }
 872
 873 /* Our support for uniforms is piggy-backed on the struct
 874  * gl_fragment_program, because that's where the values actually
 875  * get stored, rather than in some global gl_shader_program uniform
 876  * store.
 877  */
 878 void
 879 fs_visitor::setup_uniform_values(ir_variable *ir)
 880 {
 881    int namelen = strlen(ir->name);
 882
 883    /* The data for our (non-builtin) uniforms is stored in a series of
 884     * gl_uniform_driver_storage structs for each subcomponent that
 885     * glGetUniformLocation() could name.  We know it's been set up in the same
 886     * order we'd walk the type, so walk the list of storage and find anything
 887     * with our name, or the prefix of a component that starts with our name.
 888     */
 889    unsigned params_before = c->prog_data.nr_params;
 890    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 891       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 892
 893       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 894           (storage->name[namelen] != 0 &&
 895            storage->name[namelen] != '.' &&
 896            storage->name[namelen] != '[')) {
 897          continue;
 898       }
 899
 900       unsigned slots = storage->type->component_slots();
 901       if (storage->array_elements)
 902          slots *= storage->array_elements;
 903
 904       for (unsigned i = 0; i < slots; i++) {
 905          c->prog_data.param[c->prog_data.nr_params++] =
 906             &storage->storage[i].f;
 907       }
 908    }
 909
 910    /* Make sure we actually initialized the right amount of stuff here. */
 911    assert(params_before + ir->type->component_slots() ==
 912           c->prog_data.nr_params);
 913    (void)params_before;
 914 }
 915
 916
 917 /* Our support for builtin uniforms is even scarier than non-builtin.
 918  * It sits on top of the PROG_STATE_VAR parameters that are
 919  * automatically updated from GL context state.
 920  */
 921 void
 922 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 923 {
 924    const ir_state_slot *const slots = ir->state_slots;
 925    assert(ir->state_slots != NULL);
 926
 927    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 928       /* This state reference has already been setup by ir_to_mesa, but we'll
 929        * get the same index back here.
 930        */
 931       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 932                                             (gl_state_index *)slots[i].tokens);
 933
 934       /* Add each of the unique swizzles of the element as a parameter.
 935        * This'll end up matching the expected layout of the
 936        * array/matrix/structure we're trying to fill in.
 937        */
 938       int last_swiz = -1;
 939       for (unsigned int j = 0; j < 4; j++) {
 940          int swiz = GET_SWZ(slots[i].swizzle, j);
 941          if (swiz == last_swiz)
 942             break;
 943          last_swiz = swiz;
 944
 945          c->prog_data.param[c->prog_data.nr_params++] =
 946             &fp->Base.Parameters->ParameterValues[index][swiz].f;
 947       }
 948    }
 949 }
 950
 951 fs_reg *
 952 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 953 {
 954    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 955    fs_reg wpos = *reg;
 956    bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
 957
 958    /* gl_FragCoord.x */
 959    if (ir->data.pixel_center_integer) {
 960       emit(MOV(wpos, this->pixel_x));
 961    } else {
 962       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
 963    }
 964    wpos.reg_offset++;
 965
 966    /* gl_FragCoord.y */
 967    if (!flip && ir->data.pixel_center_integer) {
 968       emit(MOV(wpos, this->pixel_y));
 969    } else {
 970       fs_reg pixel_y = this->pixel_y;
 971       float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
 972
 973       if (flip) {
 974          pixel_y.negate = true;
 975          offset += c->key.drawable_height - 1.0;
 976       }
 977
 978       emit(ADD(wpos, pixel_y, fs_reg(offset)));
 979    }
 980    wpos.reg_offset++;
 981
 982    /* gl_FragCoord.z */
 983    if (brw->gen >= 6) {
 984       emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
 985    } else {
 986       emit(FS_OPCODE_LINTERP, wpos,
 987            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 988            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 989            interp_reg(VARYING_SLOT_POS, 2));
 990    }
 991    wpos.reg_offset++;
 992
 993    /* gl_FragCoord.w: Already set up in emit_interpolation */
 994    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
 995
 996    return reg;
 997 }
 998
 999 fs_inst *
1000 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1001                          glsl_interp_qualifier interpolation_mode,
1002                          bool is_centroid, bool is_sample)
1003 {
1004    brw_wm_barycentric_interp_mode barycoord_mode;
1005    if (brw->gen >= 6) {
1006       if (is_centroid) {
1007          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1008             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1009          else
1010             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1011       } else if (is_sample) {
1012           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1013             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1014          else
1015             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1016       } else {
1017          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1018             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1019          else
1020             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1021       }
1022    } else {
1023       /* On Ironlake and below, there is only one interpolation mode.
1024        * Centroid interpolation doesn't mean anything on this hardware --
1025        * there is no multisampling.
1026        */
1027       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1028    }
1029    return emit(FS_OPCODE_LINTERP, attr,
1030                this->delta_x[barycoord_mode],
1031                this->delta_y[barycoord_mode], interp);
1032 }
1033
1034 fs_reg *
1035 fs_visitor::emit_general_interpolation(ir_variable *ir)
1036 {
1037    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1038    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1039    fs_reg attr = *reg;
1040
1041    unsigned int array_elements;
1042    const glsl_type *type;
1043
1044    if (ir->type->is_array()) {
1045       array_elements = ir->type->length;
1046       if (array_elements == 0) {
1047          fail("dereferenced array '%s' has length 0\n", ir->name);
1048       }
1049       type = ir->type->fields.array;
1050    } else {
1051       array_elements = 1;
1052       type = ir->type;
1053    }
1054
1055    glsl_interp_qualifier interpolation_mode =
1056       ir->determine_interpolation_mode(c->key.flat_shade);
1057
1058    int location = ir->data.location;
1059    for (unsigned int i = 0; i < array_elements; i++) {
1060       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1061          if (c->prog_data.urb_setup[location] == -1) {
1062             /* If there's no incoming setup data for this slot, don't
1063              * emit interpolation for it.
1064              */
1065             attr.reg_offset += type->vector_elements;
1066             location++;
1067             continue;
1068          }
1069
1070          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1071             /* Constant interpolation (flat shading) case. The SF has
1072              * handed us defined values in only the constant offset
1073              * field of the setup reg.
1074              */
1075             for (unsigned int k = 0; k < type->vector_elements; k++) {
1076                struct brw_reg interp = interp_reg(location, k);
1077                interp = suboffset(interp, 3);
1078                interp.type = reg->type;
1079                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1080                attr.reg_offset++;
1081             }
1082          } else {
1083             /* Smooth/noperspective interpolation case. */
1084             for (unsigned int k = 0; k < type->vector_elements; k++) {
1085                /* FINISHME: At some point we probably want to push
1086                 * this farther by giving similar treatment to the
1087                 * other potentially constant components of the
1088                 * attribute, as well as making brw_vs_constval.c
1089                 * handle varyings other than gl_TexCoord.
1090                 */
1091                struct brw_reg interp = interp_reg(location, k);
1092                emit_linterp(attr, fs_reg(interp), interpolation_mode,
1093                             ir->data.centroid && !c->key.persample_shading,
1094                             ir->data.sample || c->key.persample_shading);
1095                if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1096                   /* Get the pixel/sample mask into f0 so that we know
1097                    * which pixels are lit.  Then, for each channel that is
1098                    * unlit, replace the centroid data with non-centroid
1099                    * data.
1100                    */
1101                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1102                   fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1103                                                interpolation_mode,
1104                                                false, false);
1105                   inst->predicate = BRW_PREDICATE_NORMAL;
1106                   inst->predicate_inverse = true;
1107                }
1108                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1109                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1110                }
1111                attr.reg_offset++;
1112             }
1113
1114          }
1115          location++;
1116       }
1117    }
1118
1119    return reg;
1120 }
1121
1122 fs_reg *
1123 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1124 {
1125    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1126
1127    /* The frontfacing comes in as a bit in the thread payload. */
1128    if (brw->gen >= 6) {
1129       emit(BRW_OPCODE_ASR, *reg,
1130            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1131            fs_reg(15));
1132       emit(BRW_OPCODE_NOT, *reg, *reg);
1133       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1134    } else {
1135       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1136       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1137        * us front face
1138        */
1139       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1140       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1141    }
1142
1143    return reg;
1144 }
1145
1146 void
1147 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1148 {
1149    assert(dst.type == BRW_REGISTER_TYPE_F);
1150
1151    if (c->key.compute_pos_offset) {
1152       /* Convert int_sample_pos to floating point */
1153       emit(MOV(dst, int_sample_pos));
1154       /* Scale to the range [0, 1] */
1155       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1156    }
1157    else {
1158       /* From ARB_sample_shading specification:
1159        * "When rendering to a non-multisample buffer, or if multisample
1160        *  rasterization is disabled, gl_SamplePosition will always be
1161        *  (0.5, 0.5).
1162        */
1163       emit(MOV(dst, fs_reg(0.5f)));
1164    }
1165 }
1166
1167 fs_reg *
1168 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1169 {
1170    assert(brw->gen >= 6);
1171    assert(ir->type == glsl_type::vec2_type);
1172
1173    this->current_annotation = "compute sample position";
1174    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1175    fs_reg pos = *reg;
1176    fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1177    fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1178
1179    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1180     * mode will be enabled.
1181     *
1182     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1183     * R31.1:0         Position Offset X/Y for Slot[3:0]
1184     * R31.3:2         Position Offset X/Y for Slot[7:4]
1185     * .....
1186     *
1187     * The X, Y sample positions come in as bytes in  thread payload. So, read
1188     * the positions using vstride=16, width=8, hstride=2.
1189     */
1190    struct brw_reg sample_pos_reg =
1191       stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1192                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1193
1194    emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1195    if (dispatch_width == 16) {
1196       int_sample_x.sechalf = true;
1197       fs_inst *inst = emit(MOV(int_sample_x,
1198                                fs_reg(suboffset(sample_pos_reg, 16))));
1199       inst->force_sechalf = true;
1200       int_sample_x.sechalf = false;
1201    }
1202    /* Compute gl_SamplePosition.x */
1203    compute_sample_position(pos, int_sample_x);
1204    pos.reg_offset++;
1205    emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1206    if (dispatch_width == 16) {
1207       int_sample_y.sechalf = true;
1208       fs_inst *inst = emit(MOV(int_sample_y,
1209                                fs_reg(suboffset(sample_pos_reg, 17))));
1210       inst->force_sechalf = true;
1211       int_sample_y.sechalf = false;
1212    }
1213    /* Compute gl_SamplePosition.y */
1214    compute_sample_position(pos, int_sample_y);
1215    return reg;
1216 }
1217
1218 fs_reg *
1219 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1220 {
1221    assert(brw->gen >= 6);
1222
1223    this->current_annotation = "compute sample id";
1224    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1225
1226    if (c->key.compute_sample_id) {
1227       fs_reg t1 = fs_reg(this, glsl_type::int_type);
1228       fs_reg t2 = fs_reg(this, glsl_type::int_type);
1229       t2.type = BRW_REGISTER_TYPE_UW;
1230
1231       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1232        * 8x multisampling, subspan 0 will represent sample N (where N
1233        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1234        * 7. We can find the value of N by looking at R0.0 bits 7:6
1235        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1236        * (since samples are always delivered in pairs). That is, we
1237        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1238        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1239        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1240        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1241        * populating a temporary variable with the sequence (0, 1, 2, 3),
1242        * and then reading from it using vstride=1, width=4, hstride=0.
1243        * These computations hold good for 4x multisampling as well.
1244        */
1245       emit(BRW_OPCODE_AND, t1,
1246            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1247            fs_reg(brw_imm_d(0xc0)));
1248       emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1249       /* This works for both SIMD8 and SIMD16 */
1250       emit(MOV(t2, brw_imm_v(0x3210)));
1251       /* This special instruction takes care of setting vstride=1,
1252        * width=4, hstride=0 of t2 during an ADD instruction.
1253        */
1254       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1255    } else {
1256       /* As per GL_ARB_sample_shading specification:
1257        * "When rendering to a non-multisample buffer, or if multisample
1258        *  rasterization is disabled, gl_SampleID will always be zero."
1259        */
1260       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1261    }
1262
1263    return reg;
1264 }
1265
1266 fs_reg *
1267 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1268 {
1269    assert(brw->gen >= 7);
1270    this->current_annotation = "compute gl_SampleMaskIn";
1271    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1272    emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1273    return reg;
1274 }
1275
1276 fs_reg
1277 fs_visitor::fix_math_operand(fs_reg src)
1278 {
1279    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1280     * might be able to do better by doing execsize = 1 math and then
1281     * expanding that result out, but we would need to be careful with
1282     * masking.
1283     *
1284     * The hardware ignores source modifiers (negate and abs) on math
1285     * instructions, so we also move to a temp to set those up.
1286     */
1287    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1288        !src.abs && !src.negate)
1289       return src;
1290
1291    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1292     * operands to math
1293     */
1294    if (brw->gen >= 7 && src.file != IMM)
1295       return src;
1296
1297    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1298    expanded.type = src.type;
1299    emit(BRW_OPCODE_MOV, expanded, src);
1300    return expanded;
1301 }
1302
1303 fs_inst *
1304 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1305 {
1306    switch (opcode) {
1307    case SHADER_OPCODE_RCP:
1308    case SHADER_OPCODE_RSQ:
1309    case SHADER_OPCODE_SQRT:
1310    case SHADER_OPCODE_EXP2:
1311    case SHADER_OPCODE_LOG2:
1312    case SHADER_OPCODE_SIN:
1313    case SHADER_OPCODE_COS:
1314       break;
1315    default:
1316       assert(!"not reached: bad math opcode");
1317       return NULL;
1318    }
1319
1320    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1321     * might be able to do better by doing execsize = 1 math and then
1322     * expanding that result out, but we would need to be careful with
1323     * masking.
1324     *
1325     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1326     * instructions, so we also move to a temp to set those up.
1327     */
1328    if (brw->gen >= 6)
1329       src = fix_math_operand(src);
1330
1331    fs_inst *inst = emit(opcode, dst, src);
1332
1333    if (brw->gen < 6) {
1334       inst->base_mrf = 2;
1335       inst->mlen = dispatch_width / 8;
1336    }
1337
1338    return inst;
1339 }
1340
1341 fs_inst *
1342 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1343 {
1344    int base_mrf = 2;
1345    fs_inst *inst;
1346
1347    switch (opcode) {
1348    case SHADER_OPCODE_INT_QUOTIENT:
1349    case SHADER_OPCODE_INT_REMAINDER:
1350       if (brw->gen >= 7 && dispatch_width == 16)
1351          fail("SIMD16 INTDIV unsupported\n");
1352       break;
1353    case SHADER_OPCODE_POW:
1354       break;
1355    default:
1356       assert(!"not reached: unsupported binary math opcode.");
1357       return NULL;
1358    }
1359
1360    if (brw->gen >= 6) {
1361       src0 = fix_math_operand(src0);
1362       src1 = fix_math_operand(src1);
1363
1364       inst = emit(opcode, dst, src0, src1);
1365    } else {
1366       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1367        * "Message Payload":
1368        *
1369        * "Operand0[7].  For the INT DIV functions, this operand is the
1370        *  denominator."
1371        *  ...
1372        * "Operand1[7].  For the INT DIV functions, this operand is the
1373        *  numerator."
1374        */
1375       bool is_int_div = opcode != SHADER_OPCODE_POW;
1376       fs_reg &op0 = is_int_div ? src1 : src0;
1377       fs_reg &op1 = is_int_div ? src0 : src1;
1378
1379       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1380       inst = emit(opcode, dst, op0, reg_null_f);
1381
1382       inst->base_mrf = base_mrf;
1383       inst->mlen = 2 * dispatch_width / 8;
1384    }
1385    return inst;
1386 }
1387
1388 void
1389 fs_visitor::assign_curb_setup()
1390 {
1391    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1392    if (dispatch_width == 8) {
1393       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1394    } else {
1395       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1396    }
1397
1398    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1399    foreach_list(node, &this->instructions) {
1400       fs_inst *inst = (fs_inst *)node;
1401
1402       for (unsigned int i = 0; i < 3; i++) {
1403          if (inst->src[i].file == UNIFORM) {
1404             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1405             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1406                                                   constant_nr / 8,
1407                                                   constant_nr % 8);
1408
1409             inst->src[i].file = HW_REG;
1410             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1411          }
1412       }
1413    }
1414 }
1415
1416 void
1417 fs_visitor::calculate_urb_setup()
1418 {
1419    for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1420       c->prog_data.urb_setup[i] = -1;
1421    }
1422
1423    int urb_next = 0;
1424    /* Figure out where each of the incoming setup attributes lands. */
1425    if (brw->gen >= 6) {
1426       if (_mesa_bitcount_64(fp->Base.InputsRead &
1427                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1428          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1429           * first 16 varying inputs, so we can put them wherever we want.
1430           * Just put them in order.
1431           *
1432           * This is useful because it means that (a) inputs not used by the
1433           * fragment shader won't take up valuable register space, and (b) we
1434           * won't have to recompile the fragment shader if it gets paired with
1435           * a different vertex (or geometry) shader.
1436           */
1437          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1438             if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1439                 BITFIELD64_BIT(i)) {
1440                c->prog_data.urb_setup[i] = urb_next++;
1441             }
1442          }
1443       } else {
1444          /* We have enough input varyings that the SF/SBE pipeline stage can't
1445           * arbitrarily rearrange them to suit our whim; we have to put them
1446           * in an order that matches the output of the previous pipeline stage
1447           * (geometry or vertex shader).
1448           */
1449          struct brw_vue_map prev_stage_vue_map;
1450          brw_compute_vue_map(brw, &prev_stage_vue_map,
1451                              c->key.input_slots_valid);
1452          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1453          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1454          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1455               slot++) {
1456             int varying = prev_stage_vue_map.slot_to_varying[slot];
1457             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1458              * unused.
1459              */
1460             if (varying != BRW_VARYING_SLOT_COUNT &&
1461                 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1462                  BITFIELD64_BIT(varying))) {
1463                c->prog_data.urb_setup[varying] = slot - first_slot;
1464             }
1465          }
1466          urb_next = prev_stage_vue_map.num_slots - first_slot;
1467       }
1468    } else {
1469       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1470       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1471          /* Point size is packed into the header, not as a general attribute */
1472          if (i == VARYING_SLOT_PSIZ)
1473             continue;
1474
1475          if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1476             /* The back color slot is skipped when the front color is
1477              * also written to.  In addition, some slots can be
1478              * written in the vertex shader and not read in the
1479              * fragment shader.  So the register number must always be
1480              * incremented, mapped or not.
1481              */
1482             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1483                c->prog_data.urb_setup[i] = urb_next;
1484             urb_next++;
1485          }
1486       }
1487
1488       /*
1489        * It's a FS only attribute, and we did interpolation for this attribute
1490        * in SF thread. So, count it here, too.
1491        *
1492        * See compile_sf_prog() for more info.
1493        */
1494       if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1495          c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1496    }
1497
1498    c->prog_data.num_varying_inputs = urb_next;
1499 }
1500
1501 void
1502 fs_visitor::assign_urb_setup()
1503 {
1504    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1505
1506    /* Offset all the urb_setup[] index by the actual position of the
1507     * setup regs, now that the location of the constants has been chosen.
1508     */
1509    foreach_list(node, &this->instructions) {
1510       fs_inst *inst = (fs_inst *)node;
1511
1512       if (inst->opcode == FS_OPCODE_LINTERP) {
1513          assert(inst->src[2].file == HW_REG);
1514          inst->src[2].fixed_hw_reg.nr += urb_start;
1515       }
1516
1517       if (inst->opcode == FS_OPCODE_CINTERP) {
1518          assert(inst->src[0].file == HW_REG);
1519          inst->src[0].fixed_hw_reg.nr += urb_start;
1520       }
1521    }
1522
1523    /* Each attribute is 4 setup channels, each of which is half a reg. */
1524    this->first_non_payload_grf =
1525       urb_start + c->prog_data.num_varying_inputs * 2;
1526 }
1527
1528 /**
1529  * Split large virtual GRFs into separate components if we can.
1530  *
1531  * This is mostly duplicated with what brw_fs_vector_splitting does,
1532  * but that's really conservative because it's afraid of doing
1533  * splitting that doesn't result in real progress after the rest of
1534  * the optimization phases, which would cause infinite looping in
1535  * optimization.  We can do it once here, safely.  This also has the
1536  * opportunity to split interpolated values, or maybe even uniforms,
1537  * which we don't have at the IR level.
1538  *
1539  * We want to split, because virtual GRFs are what we register
1540  * allocate and spill (due to contiguousness requirements for some
1541  * instructions), and they're what we naturally generate in the
1542  * codegen process, but most virtual GRFs don't actually need to be
1543  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1544  * live intervals and better dead code elimination and coalescing.
1545  */
1546 void
1547 fs_visitor::split_virtual_grfs()
1548 {
1549    int num_vars = this->virtual_grf_count;
1550    bool split_grf[num_vars];
1551    int new_virtual_grf[num_vars];
1552
1553    /* Try to split anything > 0 sized. */
1554    for (int i = 0; i < num_vars; i++) {
1555       if (this->virtual_grf_sizes[i] != 1)
1556          split_grf[i] = true;
1557       else
1558          split_grf[i] = false;
1559    }
1560
1561    if (brw->has_pln &&
1562        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1563       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1564        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1565        * Gen6, that was the only supported interpolation mode, and since Gen6,
1566        * delta_x and delta_y are in fixed hardware registers.
1567        */
1568       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1569          false;
1570    }
1571
1572    foreach_list(node, &this->instructions) {
1573       fs_inst *inst = (fs_inst *)node;
1574
1575       /* If there's a SEND message that requires contiguous destination
1576        * registers, no splitting is allowed.
1577        */
1578       if (inst->regs_written > 1) {
1579          split_grf[inst->dst.reg] = false;
1580       }
1581
1582       /* If we're sending from a GRF, don't split it, on the assumption that
1583        * the send is reading the whole thing.
1584        */
1585       if (inst->is_send_from_grf()) {
1586          for (int i = 0; i < 3; i++) {
1587             if (inst->src[i].file == GRF) {
1588                split_grf[inst->src[i].reg] = false;
1589             }
1590          }
1591       }
1592    }
1593
1594    /* Allocate new space for split regs.  Note that the virtual
1595     * numbers will be contiguous.
1596     */
1597    for (int i = 0; i < num_vars; i++) {
1598       if (split_grf[i]) {
1599          new_virtual_grf[i] = virtual_grf_alloc(1);
1600          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1601             int reg = virtual_grf_alloc(1);
1602             assert(reg == new_virtual_grf[i] + j - 1);
1603             (void) reg;
1604          }
1605          this->virtual_grf_sizes[i] = 1;
1606       }
1607    }
1608
1609    foreach_list(node, &this->instructions) {
1610       fs_inst *inst = (fs_inst *)node;
1611
1612       if (inst->dst.file == GRF &&
1613           split_grf[inst->dst.reg] &&
1614           inst->dst.reg_offset != 0) {
1615          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1616                           inst->dst.reg_offset - 1);
1617          inst->dst.reg_offset = 0;
1618       }
1619       for (int i = 0; i < 3; i++) {
1620          if (inst->src[i].file == GRF &&
1621              split_grf[inst->src[i].reg] &&
1622              inst->src[i].reg_offset != 0) {
1623             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1624                                 inst->src[i].reg_offset - 1);
1625             inst->src[i].reg_offset = 0;
1626          }
1627       }
1628    }
1629    invalidate_live_intervals();
1630 }
1631
1632 /**
1633  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1634  *
1635  * During code generation, we create tons of temporary variables, many of
1636  * which get immediately killed and are never used again.  Yet, in later
1637  * optimization and analysis passes, such as compute_live_intervals, we need
1638  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1639  * overhead.
1640  */
1641 void
1642 fs_visitor::compact_virtual_grfs()
1643 {
1644    /* Mark which virtual GRFs are used, and count how many. */
1645    int remap_table[this->virtual_grf_count];
1646    memset(remap_table, -1, sizeof(remap_table));
1647
1648    foreach_list(node, &this->instructions) {
1649       const fs_inst *inst = (const fs_inst *) node;
1650
1651       if (inst->dst.file == GRF)
1652          remap_table[inst->dst.reg] = 0;
1653
1654       for (int i = 0; i < 3; i++) {
1655          if (inst->src[i].file == GRF)
1656             remap_table[inst->src[i].reg] = 0;
1657       }
1658    }
1659
1660    /* In addition to registers used in instructions, fs_visitor keeps
1661     * direct references to certain special values which must be patched:
1662     */
1663    fs_reg *special[] = {
1664       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1665       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1666       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1667       &delta_x[0], &delta_x[1], &delta_x[2],
1668       &delta_x[3], &delta_x[4], &delta_x[5],
1669       &delta_y[0], &delta_y[1], &delta_y[2],
1670       &delta_y[3], &delta_y[4], &delta_y[5],
1671    };
1672    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1673    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1674
1675    /* Treat all special values as used, to be conservative */
1676    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1677       if (special[i]->file == GRF)
1678          remap_table[special[i]->reg] = 0;
1679    }
1680
1681    /* Compact the GRF arrays. */
1682    int new_index = 0;
1683    for (int i = 0; i < this->virtual_grf_count; i++) {
1684       if (remap_table[i] != -1) {
1685          remap_table[i] = new_index;
1686          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1687          invalidate_live_intervals();
1688          ++new_index;
1689       }
1690    }
1691
1692    this->virtual_grf_count = new_index;
1693
1694    /* Patch all the instructions to use the newly renumbered registers */
1695    foreach_list(node, &this->instructions) {
1696       fs_inst *inst = (fs_inst *) node;
1697
1698       if (inst->dst.file == GRF)
1699          inst->dst.reg = remap_table[inst->dst.reg];
1700
1701       for (int i = 0; i < 3; i++) {
1702          if (inst->src[i].file == GRF)
1703             inst->src[i].reg = remap_table[inst->src[i].reg];
1704       }
1705    }
1706
1707    /* Patch all the references to special values */
1708    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1709       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1710          special[i]->reg = remap_table[special[i]->reg];
1711    }
1712 }
1713
1714 bool
1715 fs_visitor::remove_dead_constants()
1716 {
1717    if (dispatch_width == 8) {
1718       this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1719       this->nr_params_remap = c->prog_data.nr_params;
1720
1721       for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1722          this->params_remap[i] = -1;
1723
1724       /* Find which params are still in use. */
1725       foreach_list(node, &this->instructions) {
1726          fs_inst *inst = (fs_inst *)node;
1727
1728          for (int i = 0; i < 3; i++) {
1729             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1730
1731             if (inst->src[i].file != UNIFORM)
1732                continue;
1733
1734             /* Section 5.11 of the OpenGL 4.3 spec says:
1735              *
1736              *     "Out-of-bounds reads return undefined values, which include
1737              *     values from other variables of the active program or zero."
1738              */
1739             if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1740                constant_nr = 0;
1741             }
1742
1743             /* For now, set this to non-negative.  We'll give it the
1744              * actual new number in a moment, in order to keep the
1745              * register numbers nicely ordered.
1746              */
1747             this->params_remap[constant_nr] = 0;
1748          }
1749       }
1750
1751       /* Figure out what the new numbers for the params will be.  At some
1752        * point when we're doing uniform array access, we're going to want
1753        * to keep the distinction between .reg and .reg_offset, but for
1754        * now we don't care.
1755        */
1756       unsigned int new_nr_params = 0;
1757       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1758          if (this->params_remap[i] != -1) {
1759             this->params_remap[i] = new_nr_params++;
1760          }
1761       }
1762
1763       /* Update the list of params to be uploaded to match our new numbering. */
1764       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1765          int remapped = this->params_remap[i];
1766
1767          if (remapped == -1)
1768             continue;
1769
1770          c->prog_data.param[remapped] = c->prog_data.param[i];
1771       }
1772
1773       c->prog_data.nr_params = new_nr_params;
1774    } else {
1775       /* This should have been generated in the SIMD8 pass already. */
1776       assert(this->params_remap);
1777    }
1778
1779    /* Now do the renumbering of the shader to remove unused params. */
1780    foreach_list(node, &this->instructions) {
1781       fs_inst *inst = (fs_inst *)node;
1782
1783       for (int i = 0; i < 3; i++) {
1784          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1785
1786          if (inst->src[i].file != UNIFORM)
1787             continue;
1788
1789          /* as above alias to 0 */
1790          if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1791             constant_nr = 0;
1792          }
1793          assert(this->params_remap[constant_nr] != -1);
1794          inst->src[i].reg = this->params_remap[constant_nr];
1795          inst->src[i].reg_offset = 0;
1796       }
1797    }
1798
1799    return true;
1800 }
1801
1802 /*
1803  * Implements array access of uniforms by inserting a
1804  * PULL_CONSTANT_LOAD instruction.
1805  *
1806  * Unlike temporary GRF array access (where we don't support it due to
1807  * the difficulty of doing relative addressing on instruction
1808  * destinations), we could potentially do array access of uniforms
1809  * that were loaded in GRF space as push constants.  In real-world
1810  * usage we've seen, though, the arrays being used are always larger
1811  * than we could load as push constants, so just always move all
1812  * uniform array access out to a pull constant buffer.
1813  */
1814 void
1815 fs_visitor::move_uniform_array_access_to_pull_constants()
1816 {
1817    int pull_constant_loc[c->prog_data.nr_params];
1818
1819    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1820       pull_constant_loc[i] = -1;
1821    }
1822
1823    /* Walk through and find array access of uniforms.  Put a copy of that
1824     * uniform in the pull constant buffer.
1825     *
1826     * Note that we don't move constant-indexed accesses to arrays.  No
1827     * testing has been done of the performance impact of this choice.
1828     */
1829    foreach_list_safe(node, &this->instructions) {
1830       fs_inst *inst = (fs_inst *)node;
1831
1832       for (int i = 0 ; i < 3; i++) {
1833          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1834             continue;
1835
1836          int uniform = inst->src[i].reg;
1837
1838          /* If this array isn't already present in the pull constant buffer,
1839           * add it.
1840           */
1841          if (pull_constant_loc[uniform] == -1) {
1842             const float **values = &c->prog_data.param[uniform];
1843
1844             pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1845
1846             assert(param_size[uniform]);
1847
1848             for (int j = 0; j < param_size[uniform]; j++) {
1849                c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1850                   values[j];
1851             }
1852          }
1853
1854          /* Set up the annotation tracking for new generated instructions. */
1855          base_ir = inst->ir;
1856          current_annotation = inst->annotation;
1857
1858          fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1859          fs_reg temp = fs_reg(this, glsl_type::float_type);
1860          exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1861                                                      surf_index,
1862                                                      *inst->src[i].reladdr,
1863                                                      pull_constant_loc[uniform] +
1864                                                      inst->src[i].reg_offset);
1865          inst->insert_before(&list);
1866
1867          inst->src[i].file = temp.file;
1868          inst->src[i].reg = temp.reg;
1869          inst->src[i].reg_offset = temp.reg_offset;
1870          inst->src[i].reladdr = NULL;
1871       }
1872    }
1873 }
1874
1875 /**
1876  * Choose accesses from the UNIFORM file to demote to using the pull
1877  * constant buffer.
1878  *
1879  * We allow a fragment shader to have more than the specified minimum
1880  * maximum number of fragment shader uniform components (64).  If
1881  * there are too many of these, they'd fill up all of register space.
1882  * So, this will push some of them out to the pull constant buffer and
1883  * update the program to load them.
1884  */
1885 void
1886 fs_visitor::setup_pull_constants()
1887 {
1888    /* Only allow 16 registers (128 uniform components) as push constants. */
1889    unsigned int max_uniform_components = 16 * 8;
1890    if (c->prog_data.nr_params <= max_uniform_components)
1891       return;
1892
1893    if (dispatch_width == 16) {
1894       fail("Pull constants not supported in SIMD16\n");
1895       return;
1896    }
1897
1898    /* Just demote the end of the list.  We could probably do better
1899     * here, demoting things that are rarely used in the program first.
1900     */
1901    unsigned int pull_uniform_base = max_uniform_components;
1902
1903    int pull_constant_loc[c->prog_data.nr_params];
1904    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1905       if (i < pull_uniform_base) {
1906          pull_constant_loc[i] = -1;
1907       } else {
1908          pull_constant_loc[i] = -1;
1909          /* If our constant is already being uploaded for reladdr purposes,
1910           * reuse it.
1911           */
1912          for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1913             if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1914                pull_constant_loc[i] = j;
1915                break;
1916             }
1917          }
1918          if (pull_constant_loc[i] == -1) {
1919             int pull_index = c->prog_data.nr_pull_params++;
1920             c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1921             pull_constant_loc[i] = pull_index;;
1922          }
1923       }
1924    }
1925    c->prog_data.nr_params = pull_uniform_base;
1926
1927    foreach_list(node, &this->instructions) {
1928       fs_inst *inst = (fs_inst *)node;
1929
1930       for (int i = 0; i < 3; i++) {
1931          if (inst->src[i].file != UNIFORM)
1932             continue;
1933
1934          int pull_index = pull_constant_loc[inst->src[i].reg +
1935                                             inst->src[i].reg_offset];
1936          if (pull_index == -1)
1937             continue;
1938
1939          assert(!inst->src[i].reladdr);
1940
1941          fs_reg dst = fs_reg(this, glsl_type::float_type);
1942          fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1943          fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1944          fs_inst *pull =
1945             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1946                                  dst, index, offset);
1947          pull->ir = inst->ir;
1948          pull->annotation = inst->annotation;
1949
1950          inst->insert_before(pull);
1951
1952          inst->src[i].file = GRF;
1953          inst->src[i].reg = dst.reg;
1954          inst->src[i].reg_offset = 0;
1955          inst->src[i].smear = pull_index & 3;
1956       }
1957    }
1958 }
1959
1960 bool
1961 fs_visitor::opt_algebraic()
1962 {
1963    bool progress = false;
1964
1965    foreach_list(node, &this->instructions) {
1966       fs_inst *inst = (fs_inst *)node;
1967
1968       switch (inst->opcode) {
1969       case BRW_OPCODE_MUL:
1970          if (inst->src[1].file != IMM)
1971             continue;
1972
1973          /* a * 1.0 = a */
1974          if (inst->src[1].is_one()) {
1975             inst->opcode = BRW_OPCODE_MOV;
1976             inst->src[1] = reg_undef;
1977             progress = true;
1978             break;
1979          }
1980
1981          /* a * 0.0 = 0.0 */
1982          if (inst->src[1].is_zero()) {
1983             inst->opcode = BRW_OPCODE_MOV;
1984             inst->src[0] = inst->src[1];
1985             inst->src[1] = reg_undef;
1986             progress = true;
1987             break;
1988          }
1989
1990          break;
1991       case BRW_OPCODE_ADD:
1992          if (inst->src[1].file != IMM)
1993             continue;
1994
1995          /* a + 0.0 = a */
1996          if (inst->src[1].is_zero()) {
1997             inst->opcode = BRW_OPCODE_MOV;
1998             inst->src[1] = reg_undef;
1999             progress = true;
2000             break;
2001          }
2002          break;
2003       case BRW_OPCODE_OR:
2004          if (inst->src[0].equals(inst->src[1])) {
2005             inst->opcode = BRW_OPCODE_MOV;
2006             inst->src[1] = reg_undef;
2007             progress = true;
2008             break;
2009          }
2010          break;
2011       case BRW_OPCODE_LRP:
2012          if (inst->src[1].equals(inst->src[2])) {
2013             inst->opcode = BRW_OPCODE_MOV;
2014             inst->src[0] = inst->src[1];
2015             inst->src[1] = reg_undef;
2016             inst->src[2] = reg_undef;
2017             progress = true;
2018             break;
2019          }
2020          break;
2021       case BRW_OPCODE_SEL:
2022          if (inst->saturate && inst->src[1].file == IMM) {
2023             switch (inst->conditional_mod) {
2024             case BRW_CONDITIONAL_LE:
2025             case BRW_CONDITIONAL_L:
2026                switch (inst->src[1].type) {
2027                case BRW_REGISTER_TYPE_F:
2028                   if (inst->src[1].imm.f >= 1.0f) {
2029                      inst->opcode = BRW_OPCODE_MOV;
2030                      inst->src[1] = reg_undef;
2031                      progress = true;
2032                   }
2033                   break;
2034                default:
2035                   break;
2036                }
2037                break;
2038             case BRW_CONDITIONAL_GE:
2039             case BRW_CONDITIONAL_G:
2040                switch (inst->src[1].type) {
2041                case BRW_REGISTER_TYPE_F:
2042                   if (inst->src[1].imm.f <= 0.0f) {
2043                      inst->opcode = BRW_OPCODE_MOV;
2044                      inst->src[1] = reg_undef;
2045                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2046                      progress = true;
2047                   }
2048                   break;
2049                default:
2050                   break;
2051                }
2052             default:
2053                break;
2054             }
2055          }
2056          break;
2057       default:
2058          break;
2059       }
2060    }
2061
2062    return progress;
2063 }
2064
2065 /**
2066  * Removes any instructions writing a VGRF where that VGRF is not used by any
2067  * later instruction.
2068  */
2069 bool
2070 fs_visitor::dead_code_eliminate()
2071 {
2072    bool progress = false;
2073    int pc = 0;
2074
2075    calculate_live_intervals();
2076
2077    foreach_list_safe(node, &this->instructions) {
2078       fs_inst *inst = (fs_inst *)node;
2079
2080       if (inst->dst.file == GRF && !inst->has_side_effects()) {
2081          bool dead = true;
2082
2083          for (int i = 0; i < inst->regs_written; i++) {
2084             int var = live_intervals->var_from_vgrf[inst->dst.reg];
2085             assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2086             if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2087                dead = false;
2088                break;
2089             }
2090          }
2091
2092          if (dead) {
2093             /* Don't dead code eliminate instructions that write to the
2094              * accumulator as a side-effect. Instead just set the destination
2095              * to the null register to free it.
2096              */
2097             switch (inst->opcode) {
2098             case BRW_OPCODE_ADDC:
2099             case BRW_OPCODE_SUBB:
2100             case BRW_OPCODE_MACH:
2101                inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2102                break;
2103             default:
2104                inst->remove();
2105                progress = true;
2106                break;
2107             }
2108          }
2109       }
2110
2111       pc++;
2112    }
2113
2114    if (progress)
2115       invalidate_live_intervals();
2116
2117    return progress;
2118 }
2119
/**
 * Key of the hash table used by the local dead code pass: one register
 * offset within one virtual GRF.
 */
struct dead_code_hash_key {
   int vgrf;       /* virtual GRF number */
   int reg_offset; /* register offset within that virtual GRF */
};
2125
2126 static bool
2127 dead_code_hash_compare(const void *a, const void *b)
2128 {
2129    return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2130 }
2131
2132 static void
2133 clear_dead_code_hash(struct hash_table *ht)
2134 {
2135    struct hash_entry *entry;
2136
2137    hash_table_foreach(ht, entry) {
2138       _mesa_hash_table_remove(ht, entry);
2139    }
2140 }
2141
2142 static void
2143 insert_dead_code_hash(struct hash_table *ht,
2144                       int vgrf, int reg_offset, fs_inst *inst)
2145 {
2146    /* We don't bother freeing keys, because they'll be GCed with the ht. */
2147    struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2148
2149    key->vgrf = vgrf;
2150    key->reg_offset = reg_offset;
2151
2152    _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2153 }
2154
2155 static struct hash_entry *
2156 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2157 {
2158    struct dead_code_hash_key key;
2159
2160    key.vgrf = vgrf;
2161    key.reg_offset = reg_offset;
2162
2163    return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2164 }
2165
2166 static void
2167 remove_dead_code_hash(struct hash_table *ht,
2168                       int vgrf, int reg_offset)
2169 {
2170    struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2171    if (!entry)
2172       return;
2173
2174    _mesa_hash_table_remove(ht, entry);
2175 }
2176
2177 /**
2178  * Walks basic blocks, removing any regs that are written but not read before
2179  * being redefined.
2180  *
2181  * The dead_code_eliminate() function implements a global dead code
2182  * elimination, but it only handles the removing the last write to a register
2183  * if it's never read.  This one can handle intermediate writes, but only
2184  * within a basic block.
2185  */
2186 bool
2187 fs_visitor::dead_code_eliminate_local()
2188 {
2189    struct hash_table *ht;
2190    bool progress = false;
2191
2192    ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2193
2194    if (ht == NULL) {
2195       return false;
2196    }
2197
2198    foreach_list_safe(node, &this->instructions) {
2199       fs_inst *inst = (fs_inst *)node;
2200
2201       /* At a basic block, empty the HT since we don't understand dataflow
2202        * here.
2203        */
2204       if (inst->is_control_flow()) {
2205          clear_dead_code_hash(ht);
2206          continue;
2207       }
2208
2209       /* Clear the HT of any instructions that got read. */
2210       for (int i = 0; i < 3; i++) {
2211          fs_reg src = inst->src[i];
2212          if (src.file != GRF)
2213             continue;
2214
2215          int read = 1;
2216          if (inst->is_send_from_grf())
2217             read = virtual_grf_sizes[src.reg] - src.reg_offset;
2218
2219          for (int reg_offset = src.reg_offset;
2220               reg_offset < src.reg_offset + read;
2221               reg_offset++) {
2222             remove_dead_code_hash(ht, src.reg, reg_offset);
2223          }
2224       }
2225
2226       /* Add any update of a GRF to the HT, removing a previous write if it
2227        * wasn't read.
2228        */
2229       if (inst->dst.file == GRF) {
2230          if (inst->regs_written > 1) {
2231             /* We don't know how to trim channels from an instruction's
2232              * writes, so we can't incrementally remove unread channels from
2233              * it.  Just remove whatever it overwrites from the table
2234              */
2235             for (int i = 0; i < inst->regs_written; i++) {
2236                remove_dead_code_hash(ht,
2237                                      inst->dst.reg,
2238                                      inst->dst.reg_offset + i);
2239             }
2240          } else {
2241             struct hash_entry *entry =
2242                get_dead_code_hash_entry(ht, inst->dst.reg,
2243                                         inst->dst.reg_offset);
2244
2245             if (entry) {
2246                if (inst->is_partial_write()) {
2247                   /* For a partial write, we can't remove any previous dead code
2248                    * candidate, since we're just modifying their result.
2249                    */
2250                } else {
2251                   /* We're completely updating a channel, and there was a
2252                    * previous write to the channel that wasn't read.  Kill it!
2253                    */
2254                   fs_inst *inst = (fs_inst *)entry->data;
2255                   inst->remove();
2256                   progress = true;
2257                }
2258
2259                _mesa_hash_table_remove(ht, entry);
2260             }
2261
2262             if (!inst->has_side_effects())
2263                insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2264                                      inst);
2265          }
2266       }
2267    }
2268
2269    _mesa_hash_table_destroy(ht, NULL);
2270
2271    if (progress)
2272       invalidate_live_intervals();
2273
2274    return progress;
2275 }
2276
2277 /**
2278  * Implements register coalescing: Checks if the two registers involved in a
2279  * raw move don't interfere, in which case they can both be stored in the same
2280  * place and the MOV removed.
2281  *
2282  * To do this, all uses of the source of the MOV in the shader are replaced
2283  * with the destination of the MOV. For example:
2284  *
2285  * add vgrf3:F, vgrf1:F, vgrf2:F
2286  * mov vgrf4:F, vgrf3:F
2287  * mul vgrf5:F, vgrf5:F, vgrf4:F
2288  *
2289  * becomes
2290  *
2291  * add vgrf4:F, vgrf1:F, vgrf2:F
2292  * mul vgrf5:F, vgrf5:F, vgrf4:F
2293  */
2294 bool
2295 fs_visitor::register_coalesce()
2296 {
2297    bool progress = false;
2298
2299    calculate_live_intervals();
2300
2301    int src_size = 0;
2302    int channels_remaining = 0;
2303    int reg_from = -1, reg_to = -1;
2304    int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2305    fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2306
2307    foreach_list(node, &this->instructions) {
2308       fs_inst *inst = (fs_inst *)node;
2309
2310       if (inst->opcode != BRW_OPCODE_MOV ||
2311           inst->is_partial_write() ||
2312           inst->saturate ||
2313           inst->src[0].file != GRF ||
2314           inst->src[0].negate ||
2315           inst->src[0].abs ||
2316           inst->src[0].smear != -1 ||
2317           inst->dst.file != GRF ||
2318           inst->dst.type != inst->src[0].type) {
2319          continue;
2320       }
2321
2322       if (virtual_grf_sizes[inst->src[0].reg] >
2323           virtual_grf_sizes[inst->dst.reg])
2324          continue;
2325
2326       int var_from = live_intervals->var_from_reg(&inst->src[0]);
2327       int var_to = live_intervals->var_from_reg(&inst->dst);
2328
2329       if (live_intervals->vars_interfere(var_from, var_to) &&
2330           !inst->dst.equals(inst->src[0])) {
2331
2332          /* We know that the live ranges of A (var_from) and B (var_to)
2333           * interfere because of the ->vars_interfere() call above. If the end
2334           * of B's live range is after the end of A's range, then we know two
2335           * things:
2336           *  - the start of B's live range must be in A's live range (since we
2337           *    already know the two ranges interfere, this is the only remaining
2338           *    possibility)
2339           *  - the interference isn't of the form we're looking for (where B is
2340           *    entirely inside A)
2341           */
2342          if (live_intervals->end[var_to] > live_intervals->end[var_from])
2343             continue;
2344
2345          bool overwritten = false;
2346          int scan_ip = -1;
2347
2348          foreach_list(n, &this->instructions) {
2349             fs_inst *scan_inst = (fs_inst *)n;
2350             scan_ip++;
2351
2352             if (scan_inst->is_control_flow()) {
2353                overwritten = true;
2354                break;
2355             }
2356
2357             if (scan_ip <= live_intervals->start[var_to])
2358                continue;
2359
2360             if (scan_ip > live_intervals->end[var_to])
2361                break;
2362
2363             if (scan_inst->dst.equals(inst->dst) ||
2364                 scan_inst->dst.equals(inst->src[0])) {
2365                overwritten = true;
2366                break;
2367             }
2368          }
2369
2370          if (overwritten)
2371             continue;
2372       }
2373
2374       if (reg_from != inst->src[0].reg) {
2375          reg_from = inst->src[0].reg;
2376
2377          src_size = virtual_grf_sizes[inst->src[0].reg];
2378          assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2379
2380          channels_remaining = src_size;
2381          memset(mov, 0, sizeof(mov));
2382
2383          reg_to = inst->dst.reg;
2384       }
2385
2386       if (reg_to != inst->dst.reg)
2387          continue;
2388
2389       const int offset = inst->src[0].reg_offset;
2390       reg_to_offset[offset] = inst->dst.reg_offset;
2391       mov[offset] = inst;
2392       channels_remaining--;
2393
2394       if (channels_remaining)
2395          continue;
2396
2397       bool removed = false;
2398       for (int i = 0; i < src_size; i++) {
2399          if (mov[i]) {
2400             removed = true;
2401
2402             mov[i]->opcode = BRW_OPCODE_NOP;
2403             mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2404             mov[i]->dst = reg_undef;
2405             mov[i]->src[0] = reg_undef;
2406             mov[i]->src[1] = reg_undef;
2407             mov[i]->src[2] = reg_undef;
2408          }
2409       }
2410
2411       foreach_list(node, &this->instructions) {
2412          fs_inst *scan_inst = (fs_inst *)node;
2413
2414          for (int i = 0; i < src_size; i++) {
2415             if (mov[i]) {
2416                if (scan_inst->dst.file == GRF &&
2417                    scan_inst->dst.reg == reg_from &&
2418                    scan_inst->dst.reg_offset == i) {
2419                   scan_inst->dst.reg = reg_to;
2420                   scan_inst->dst.reg_offset = reg_to_offset[i];
2421                }
2422                for (int j = 0; j < 3; j++) {
2423                   if (scan_inst->src[j].file == GRF &&
2424                       scan_inst->src[j].reg == reg_from &&
2425                       scan_inst->src[j].reg_offset == i) {
2426                      scan_inst->src[j].reg = reg_to;
2427                      scan_inst->src[j].reg_offset = reg_to_offset[i];
2428                   }
2429                }
2430             }
2431          }
2432       }
2433
2434       if (removed) {
2435          live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2436                                               live_intervals->start[var_from]);
2437          live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2438                                             live_intervals->end[var_from]);
2439          reg_from = -1;
2440       }
2441    }
2442
2443    foreach_list_safe(node, &this->instructions) {
2444       fs_inst *inst = (fs_inst *)node;
2445
2446       if (inst->opcode == BRW_OPCODE_NOP) {
2447          inst->remove();
2448          progress = true;
2449       }
2450    }
2451
2452    if (progress)
2453       invalidate_live_intervals();
2454
2455    return progress;
2456 }
2457
2458 bool
2459 fs_visitor::compute_to_mrf()
2460 {
2461    bool progress = false;
2462    int next_ip = 0;
2463
2464    calculate_live_intervals();
2465
2466    foreach_list_safe(node, &this->instructions) {
2467       fs_inst *inst = (fs_inst *)node;
2468
2469       int ip = next_ip;
2470       next_ip++;
2471
2472       if (inst->opcode != BRW_OPCODE_MOV ||
2473           inst->is_partial_write() ||
2474           inst->dst.file != MRF || inst->src[0].file != GRF ||
2475           inst->dst.type != inst->src[0].type ||
2476           inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2477          continue;
2478
2479       /* Work out which hardware MRF registers are written by this
2480        * instruction.
2481        */
2482       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2483       int mrf_high;
2484       if (inst->dst.reg & BRW_MRF_COMPR4) {
2485          mrf_high = mrf_low + 4;
2486       } else if (dispatch_width == 16 &&
2487                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2488          mrf_high = mrf_low + 1;
2489       } else {
2490          mrf_high = mrf_low;
2491       }
2492
2493       /* Can't compute-to-MRF this GRF if someone else was going to
2494        * read it later.
2495        */
2496       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2497          continue;
2498
2499       /* Found a move of a GRF to a MRF.  Let's see if we can go
2500        * rewrite the thing that made this GRF to write into the MRF.
2501        */
2502       fs_inst *scan_inst;
2503       for (scan_inst = (fs_inst *)inst->prev;
2504            scan_inst->prev != NULL;
2505            scan_inst = (fs_inst *)scan_inst->prev) {
2506          if (scan_inst->dst.file == GRF &&
2507              scan_inst->dst.reg == inst->src[0].reg) {
2508             /* Found the last thing to write our reg we want to turn
2509              * into a compute-to-MRF.
2510              */
2511
2512             /* If this one instruction didn't populate all the
2513              * channels, bail.  We might be able to rewrite everything
2514              * that writes that reg, but it would require smarter
2515              * tracking to delay the rewriting until complete success.
2516              */
2517             if (scan_inst->is_partial_write())
2518                break;
2519
2520             /* Things returning more than one register would need us to
2521              * understand coalescing out more than one MOV at a time.
2522              */
2523             if (scan_inst->regs_written > 1)
2524                break;
2525
2526             /* SEND instructions can't have MRF as a destination. */
2527             if (scan_inst->mlen)
2528                break;
2529
2530             if (brw->gen == 6) {
2531                /* gen6 math instructions must have the destination be
2532                 * GRF, so no compute-to-MRF for them.
2533                 */
2534                if (scan_inst->is_math()) {
2535                   break;
2536                }
2537             }
2538
2539             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2540                /* Found the creator of our MRF's source value. */
2541                scan_inst->dst.file = MRF;
2542                scan_inst->dst.reg = inst->dst.reg;
2543                scan_inst->saturate |= inst->saturate;
2544                inst->remove();
2545                progress = true;
2546             }
2547             break;
2548          }
2549
2550          /* We don't handle control flow here.  Most computation of
2551           * values that end up in MRFs are shortly before the MRF
2552           * write anyway.
2553           */
2554          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2555             break;
2556
2557          /* You can't read from an MRF, so if someone else reads our
2558           * MRF's source GRF that we wanted to rewrite, that stops us.
2559           */
2560          bool interfered = false;
2561          for (int i = 0; i < 3; i++) {
2562             if (scan_inst->src[i].file == GRF &&
2563                 scan_inst->src[i].reg == inst->src[0].reg &&
2564                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2565                interfered = true;
2566             }
2567          }
2568          if (interfered)
2569             break;
2570
2571          if (scan_inst->dst.file == MRF) {
2572             /* If somebody else writes our MRF here, we can't
2573              * compute-to-MRF before that.
2574              */
2575             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2576             int scan_mrf_high;
2577
2578             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2579                scan_mrf_high = scan_mrf_low + 4;
2580             } else if (dispatch_width == 16 &&
2581                        (!scan_inst->force_uncompressed &&
2582                         !scan_inst->force_sechalf)) {
2583                scan_mrf_high = scan_mrf_low + 1;
2584             } else {
2585                scan_mrf_high = scan_mrf_low;
2586             }
2587
2588             if (mrf_low == scan_mrf_low ||
2589                 mrf_low == scan_mrf_high ||
2590                 mrf_high == scan_mrf_low ||
2591                 mrf_high == scan_mrf_high) {
2592                break;
2593             }
2594          }
2595
2596          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2597             /* Found a SEND instruction, which means that there are
2598              * live values in MRFs from base_mrf to base_mrf +
2599              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2600              * above it.
2601              */
2602             if (mrf_low >= scan_inst->base_mrf &&
2603                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2604                break;
2605             }
2606             if (mrf_high >= scan_inst->base_mrf &&
2607                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2608                break;
2609             }
2610          }
2611       }
2612    }
2613
2614    if (progress)
2615       invalidate_live_intervals();
2616
2617    return progress;
2618 }
2619
2620 /**
2621  * Walks through basic blocks, looking for repeated MRF writes and
2622  * removing the later ones.
2623  */
2624 bool
2625 fs_visitor::remove_duplicate_mrf_writes()
2626 {
2627    fs_inst *last_mrf_move[16];
2628    bool progress = false;
2629
2630    /* Need to update the MRF tracking for compressed instructions. */
2631    if (dispatch_width == 16)
2632       return false;
2633
2634    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2635
2636    foreach_list_safe(node, &this->instructions) {
2637       fs_inst *inst = (fs_inst *)node;
2638
2639       if (inst->is_control_flow()) {
2640          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2641       }
2642
2643       if (inst->opcode == BRW_OPCODE_MOV &&
2644           inst->dst.file == MRF) {
2645          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2646          if (prev_inst && inst->equals(prev_inst)) {
2647             inst->remove();
2648             progress = true;
2649             continue;
2650          }
2651       }
2652
2653       /* Clear out the last-write records for MRFs that were overwritten. */
2654       if (inst->dst.file == MRF) {
2655          last_mrf_move[inst->dst.reg] = NULL;
2656       }
2657
2658       if (inst->mlen > 0 && inst->base_mrf != -1) {
2659          /* Found a SEND instruction, which will include two or fewer
2660           * implied MRF writes.  We could do better here.
2661           */
2662          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2663             last_mrf_move[inst->base_mrf + i] = NULL;
2664          }
2665       }
2666
2667       /* Clear out any MRF move records whose sources got overwritten. */
2668       if (inst->dst.file == GRF) {
2669          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2670             if (last_mrf_move[i] &&
2671                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2672                last_mrf_move[i] = NULL;
2673             }
2674          }
2675       }
2676
2677       if (inst->opcode == BRW_OPCODE_MOV &&
2678           inst->dst.file == MRF &&
2679           inst->src[0].file == GRF &&
2680           !inst->is_partial_write()) {
2681          last_mrf_move[inst->dst.reg] = inst;
2682       }
2683    }
2684
2685    if (progress)
2686       invalidate_live_intervals();
2687
2688    return progress;
2689 }
2690
2691 static void
2692 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2693                         int first_grf, int grf_len)
2694 {
2695    bool inst_simd16 = (dispatch_width > 8 &&
2696                        !inst->force_uncompressed &&
2697                        !inst->force_sechalf);
2698
2699    /* Clear the flag for registers that actually got read (as expected). */
2700    for (int i = 0; i < 3; i++) {
2701       int grf;
2702       if (inst->src[i].file == GRF) {
2703          grf = inst->src[i].reg;
2704       } else if (inst->src[i].file == HW_REG &&
2705                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2706          grf = inst->src[i].fixed_hw_reg.nr;
2707       } else {
2708          continue;
2709       }
2710
2711       if (grf >= first_grf &&
2712           grf < first_grf + grf_len) {
2713          deps[grf - first_grf] = false;
2714          if (inst_simd16)
2715             deps[grf - first_grf + 1] = false;
2716       }
2717    }
2718 }
2719
2720 /**
2721  * Implements this workaround for the original 965:
2722  *
2723  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2724  *      check for post destination dependencies on this instruction, software
2725  *      must ensure that there is no destination hazard for the case of ‘write
2726  *      followed by a posted write’ shown in the following example.
2727  *
2728  *      1. mov r3 0
2729  *      2. send r3.xy <rest of send instruction>
2730  *      3. mov r2 r3
2731  *
2732  *      Due to no post-destination dependency check on the ‘send’, the above
2733  *      code sequence could have two instructions (1 and 2) in flight at the
2734  *      same time that both consider ‘r3’ as the target of their final writes.
2735  */
2736 void
2737 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2738 {
2739    int reg_size = dispatch_width / 8;
2740    int write_len = inst->regs_written * reg_size;
2741    int first_write_grf = inst->dst.reg;
2742    bool needs_dep[BRW_MAX_MRF];
2743    assert(write_len < (int)sizeof(needs_dep) - 1);
2744
2745    memset(needs_dep, false, sizeof(needs_dep));
2746    memset(needs_dep, true, write_len);
2747
2748    clear_deps_for_inst_src(inst, dispatch_width,
2749                            needs_dep, first_write_grf, write_len);
2750
2751    /* Walk backwards looking for writes to registers we're writing which
2752     * aren't read since being written.  If we hit the start of the program,
2753     * we assume that there are no outstanding dependencies on entry to the
2754     * program.
2755     */
2756    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2757         scan_inst != NULL;
2758         scan_inst = (fs_inst *)scan_inst->prev) {
2759
2760       /* If we hit control flow, assume that there *are* outstanding
2761        * dependencies, and force their cleanup before our instruction.
2762        */
2763       if (scan_inst->is_control_flow()) {
2764          for (int i = 0; i < write_len; i++) {
2765             if (needs_dep[i]) {
2766                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2767             }
2768          }
2769          return;
2770       }
2771
2772       bool scan_inst_simd16 = (dispatch_width > 8 &&
2773                                !scan_inst->force_uncompressed &&
2774                                !scan_inst->force_sechalf);
2775
2776       /* We insert our reads as late as possible on the assumption that any
2777        * instruction but a MOV that might have left us an outstanding
2778        * dependency has more latency than a MOV.
2779        */
2780       if (scan_inst->dst.file == GRF) {
2781          for (int i = 0; i < scan_inst->regs_written; i++) {
2782             int reg = scan_inst->dst.reg + i * reg_size;
2783
2784             if (reg >= first_write_grf &&
2785                 reg < first_write_grf + write_len &&
2786                 needs_dep[reg - first_write_grf]) {
2787                inst->insert_before(DEP_RESOLVE_MOV(reg));
2788                needs_dep[reg - first_write_grf] = false;
2789                if (scan_inst_simd16)
2790                   needs_dep[reg - first_write_grf + 1] = false;
2791             }
2792          }
2793       }
2794
2795       /* Clear the flag for registers that actually got read (as expected). */
2796       clear_deps_for_inst_src(scan_inst, dispatch_width,
2797                               needs_dep, first_write_grf, write_len);
2798
2799       /* Continue the loop only if we haven't resolved all the dependencies */
2800       int i;
2801       for (i = 0; i < write_len; i++) {
2802          if (needs_dep[i])
2803             break;
2804       }
2805       if (i == write_len)
2806          return;
2807    }
2808 }
2809
2810 /**
2811  * Implements this workaround for the original 965:
2812  *
2813  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2814  *      used as a destination register until after it has been sourced by an
2815  *      instruction with a different destination register.
2816  */
2817 void
2818 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2819 {
2820    int write_len = inst->regs_written * dispatch_width / 8;
2821    int first_write_grf = inst->dst.reg;
2822    bool needs_dep[BRW_MAX_MRF];
2823    assert(write_len < (int)sizeof(needs_dep) - 1);
2824
2825    memset(needs_dep, false, sizeof(needs_dep));
2826    memset(needs_dep, true, write_len);
2827    /* Walk forwards looking for writes to registers we're writing which aren't
2828     * read before being written.
2829     */
2830    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2831         !scan_inst->is_tail_sentinel();
2832         scan_inst = (fs_inst *)scan_inst->next) {
2833       /* If we hit control flow, force resolve all remaining dependencies. */
2834       if (scan_inst->is_control_flow()) {
2835          for (int i = 0; i < write_len; i++) {
2836             if (needs_dep[i])
2837                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2838          }
2839          return;
2840       }
2841
2842       /* Clear the flag for registers that actually got read (as expected). */
2843       clear_deps_for_inst_src(scan_inst, dispatch_width,
2844                               needs_dep, first_write_grf, write_len);
2845
2846       /* We insert our reads as late as possible since they're reading the
2847        * result of a SEND, which has massive latency.
2848        */
2849       if (scan_inst->dst.file == GRF &&
2850           scan_inst->dst.reg >= first_write_grf &&
2851           scan_inst->dst.reg < first_write_grf + write_len &&
2852           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2853          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2854          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2855       }
2856
2857       /* Continue the loop only if we haven't resolved all the dependencies */
2858       int i;
2859       for (i = 0; i < write_len; i++) {
2860          if (needs_dep[i])
2861             break;
2862       }
2863       if (i == write_len)
2864          return;
2865    }
2866
2867    /* If we hit the end of the program, resolve all remaining dependencies out
2868     * of paranoia.
2869     */
2870    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2871    assert(last_inst->eot);
2872    for (int i = 0; i < write_len; i++) {
2873       if (needs_dep[i])
2874          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2875    }
2876 }
2877
2878 void
2879 fs_visitor::insert_gen4_send_dependency_workarounds()
2880 {
2881    if (brw->gen != 4 || brw->is_g4x)
2882       return;
2883
2884    /* Note that we're done with register allocation, so GRF fs_regs always
2885     * have a .reg_offset of 0.
2886     */
2887
2888    foreach_list_safe(node, &this->instructions) {
2889       fs_inst *inst = (fs_inst *)node;
2890
2891       if (inst->mlen != 0 && inst->dst.file == GRF) {
2892          insert_gen4_pre_send_dependency_workarounds(inst);
2893          insert_gen4_post_send_dependency_workarounds(inst);
2894       }
2895    }
2896 }
2897
2898 /**
2899  * Turns the generic expression-style uniform pull constant load instruction
2900  * into a hardware-specific series of instructions for loading a pull
2901  * constant.
2902  *
2903  * The expression style allows the CSE pass before this to optimize out
2904  * repeated loads from the same offset, and gives the pre-register-allocation
2905  * scheduling full flexibility, while the conversion to native instructions
2906  * allows the post-register-allocation scheduler the best information
2907  * possible.
2908  *
2909  * Note that execution masking for setting up pull constant loads is special:
2910  * the channels that need to be written are unrelated to the current execution
2911  * mask, since a later instruction will use one of the result channels as a
2912  * source operand for all 8 or 16 of its channels.
2913  */
2914 void
2915 fs_visitor::lower_uniform_pull_constant_loads()
2916 {
2917    foreach_list(node, &this->instructions) {
2918       fs_inst *inst = (fs_inst *)node;
2919
2920       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2921          continue;
2922
2923       if (brw->gen >= 7) {
2924          /* The offset arg before was a vec4-aligned byte offset.  We need to
2925           * turn it into a dword offset.
2926           */
2927          fs_reg const_offset_reg = inst->src[1];
2928          assert(const_offset_reg.file == IMM &&
2929                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2930          const_offset_reg.imm.u /= 4;
2931          fs_reg payload = fs_reg(this, glsl_type::uint_type);
2932
2933          /* This is actually going to be a MOV, but since only the first dword
2934           * is accessed, we have a special opcode to do just that one.  Note
2935           * that this needs to be an operation that will be considered a def
2936           * by live variable analysis, or register allocation will explode.
2937           */
2938          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2939                                                payload, const_offset_reg);
2940          setup->force_writemask_all = true;
2941
2942          setup->ir = inst->ir;
2943          setup->annotation = inst->annotation;
2944          inst->insert_before(setup);
2945
2946          /* Similarly, this will only populate the first 4 channels of the
2947           * result register (since we only use smear values from 0-3), but we
2948           * don't tell the optimizer.
2949           */
2950          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2951          inst->src[1] = payload;
2952
2953          invalidate_live_intervals();
2954       } else {
2955          /* Before register allocation, we didn't tell the scheduler about the
2956           * MRF we use.  We know it's safe to use this MRF because nothing
2957           * else does except for register spill/unspill, which generates and
2958           * uses its MRF within a single IR instruction.
2959           */
2960          inst->base_mrf = 14;
2961          inst->mlen = 1;
2962       }
2963    }
2964 }
2965
2966 void
2967 fs_visitor::dump_instructions()
2968 {
2969    calculate_register_pressure();
2970
2971    int ip = 0, max_pressure = 0;
2972    foreach_list(node, &this->instructions) {
2973       backend_instruction *inst = (backend_instruction *)node;
2974       max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2975       printf("{%3d} %4d: ", regs_live_at_ip[ip], ip);
2976       dump_instruction(inst);
2977       ++ip;
2978    }
2979    printf("Maximum %3d registers live at once.\n", max_pressure);
2980 }
2981
2982 void
2983 fs_visitor::dump_instruction(backend_instruction *be_inst)
2984 {
2985    fs_inst *inst = (fs_inst *)be_inst;
2986
2987    if (inst->predicate) {
2988       printf("(%cf0.%d) ",
2989              inst->predicate_inverse ? '-' : '+',
2990              inst->flag_subreg);
2991    }
2992
2993    printf("%s", brw_instruction_name(inst->opcode));
2994    if (inst->saturate)
2995       printf(".sat");
2996    if (inst->conditional_mod) {
2997       printf("%s", conditional_modifier[inst->conditional_mod]);
2998       if (!inst->predicate &&
2999           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3000                               inst->opcode != BRW_OPCODE_IF &&
3001                               inst->opcode != BRW_OPCODE_WHILE))) {
3002          printf(".f0.%d", inst->flag_subreg);
3003       }
3004    }
3005    printf(" ");
3006
3007
3008    switch (inst->dst.file) {
3009    case GRF:
3010       printf("vgrf%d", inst->dst.reg);
3011       if (virtual_grf_sizes[inst->dst.reg] != 1)
3012          printf("+%d", inst->dst.reg_offset);
3013       break;
3014    case MRF:
3015       printf("m%d", inst->dst.reg);
3016       break;
3017    case BAD_FILE:
3018       printf("(null)");
3019       break;
3020    case UNIFORM:
3021       printf("***u%d***", inst->dst.reg);
3022       break;
3023    case HW_REG:
3024       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3025          switch (inst->dst.fixed_hw_reg.nr) {
3026          case BRW_ARF_NULL:
3027             printf("null");
3028             break;
3029          case BRW_ARF_ADDRESS:
3030             printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
3031             break;
3032          case BRW_ARF_ACCUMULATOR:
3033             printf("acc%d", inst->dst.fixed_hw_reg.subnr);
3034             break;
3035          case BRW_ARF_FLAG:
3036             printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3037                              inst->dst.fixed_hw_reg.subnr);
3038             break;
3039          default:
3040             printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3041                                inst->dst.fixed_hw_reg.subnr);
3042             break;
3043          }
3044       } else {
3045          printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3046       }
3047       if (inst->dst.fixed_hw_reg.subnr)
3048          printf("+%d", inst->dst.fixed_hw_reg.subnr);
3049       break;
3050    default:
3051       printf("???");
3052       break;
3053    }
3054    printf(":%s, ", reg_encoding[inst->dst.type]);
3055
3056    for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3057       if (inst->src[i].negate)
3058          printf("-");
3059       if (inst->src[i].abs)
3060          printf("|");
3061       switch (inst->src[i].file) {
3062       case GRF:
3063          printf("vgrf%d", inst->src[i].reg);
3064          if (virtual_grf_sizes[inst->src[i].reg] != 1)
3065             printf("+%d", inst->src[i].reg_offset);
3066          break;
3067       case MRF:
3068          printf("***m%d***", inst->src[i].reg);
3069          break;
3070       case UNIFORM:
3071          printf("u%d", inst->src[i].reg);
3072          if (virtual_grf_sizes[inst->src[i].reg] != 1)
3073             printf(".%d", inst->src[i].reg_offset);
3074          break;
3075       case BAD_FILE:
3076          printf("(null)");
3077          break;
3078       case IMM:
3079          switch (inst->src[i].type) {
3080          case BRW_REGISTER_TYPE_F:
3081             printf("%ff", inst->src[i].imm.f);
3082             break;
3083          case BRW_REGISTER_TYPE_D:
3084             printf("%dd", inst->src[i].imm.i);
3085             break;
3086          case BRW_REGISTER_TYPE_UD:
3087             printf("%uu", inst->src[i].imm.u);
3088             break;
3089          default:
3090             printf("???");
3091             break;
3092          }
3093          break;
3094       case HW_REG:
3095          if (inst->src[i].fixed_hw_reg.negate)
3096             printf("-");
3097          if (inst->src[i].fixed_hw_reg.abs)
3098             printf("|");
3099          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3100             switch (inst->src[i].fixed_hw_reg.nr) {
3101             case BRW_ARF_NULL:
3102                printf("null");
3103                break;
3104             case BRW_ARF_ADDRESS:
3105                printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
3106                break;
3107             case BRW_ARF_ACCUMULATOR:
3108                printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
3109                break;
3110             case BRW_ARF_FLAG:
3111                printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3112                                 inst->src[i].fixed_hw_reg.subnr);
3113                break;
3114             default:
3115                printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3116                                   inst->src[i].fixed_hw_reg.subnr);
3117                break;
3118             }
3119          } else {
3120             printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3121          }
3122          if (inst->src[i].fixed_hw_reg.subnr)
3123             printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3124          if (inst->src[i].fixed_hw_reg.abs)
3125             printf("|");
3126          break;
3127       default:
3128          printf("???");
3129          break;
3130       }
3131       if (inst->src[i].abs)
3132          printf("|");
3133
3134       if (inst->src[i].file != IMM) {
3135          printf(":%s", brw_reg_type_letters(inst->src[i].type));
3136       }
3137
3138       if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3139          printf(", ");
3140    }
3141
3142    printf(" ");
3143
3144    if (inst->force_uncompressed)
3145       printf("1sthalf ");
3146
3147    if (inst->force_sechalf)
3148       printf("2ndhalf ");
3149
3150    printf("\n");
3151 }
3152
3153 /**
3154  * Possibly returns an instruction that set up @param reg.
3155  *
3156  * Sometimes we want to take the result of some expression/variable
3157  * dereference tree and rewrite the instruction generating the result
3158  * of the tree.  When processing the tree, we know that the
3159  * instructions generated are all writing temporaries that are dead
3160  * outside of this tree.  So, if we have some instructions that write
3161  * a temporary, we're free to point that temp write somewhere else.
3162  *
3163  * Note that this doesn't guarantee that the instruction generated
3164  * only reg -- it might be the size=4 destination of a texture instruction.
3165  */
3166 fs_inst *
3167 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3168                                            fs_inst *end,
3169                                            fs_reg reg)
3170 {
3171    if (end == start ||
3172        end->is_partial_write() ||
3173        reg.reladdr ||
3174        !reg.equals(end->dst)) {
3175       return NULL;
3176    } else {
3177       return end;
3178    }
3179 }
3180
3181 void
3182 fs_visitor::setup_payload_gen6()
3183 {
3184    bool uses_depth =
3185       (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3186    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3187
3188    assert(brw->gen >= 6);
3189
3190    /* R0-1: masks, pixel X/Y coordinates. */
3191    c->nr_payload_regs = 2;
3192    /* R2: only for 32-pixel dispatch.*/
3193
3194    /* R3-26: barycentric interpolation coordinates.  These appear in the
3195     * same order that they appear in the brw_wm_barycentric_interp_mode
3196     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3197     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3198     * appear if they were enabled using the "Barycentric Interpolation
3199     * Mode" bits in WM_STATE.
3200     */
3201    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3202       if (barycentric_interp_modes & (1 << i)) {
3203          c->barycentric_coord_reg[i] = c->nr_payload_regs;
3204          c->nr_payload_regs += 2;
3205          if (dispatch_width == 16) {
3206             c->nr_payload_regs += 2;
3207          }
3208       }
3209    }
3210
3211    /* R27: interpolated depth if uses source depth */
3212    if (uses_depth) {
3213       c->source_depth_reg = c->nr_payload_regs;
3214       c->nr_payload_regs++;
3215       if (dispatch_width == 16) {
3216          /* R28: interpolated depth if not SIMD8. */
3217          c->nr_payload_regs++;
3218       }
3219    }
3220    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3221    if (uses_depth) {
3222       c->source_w_reg = c->nr_payload_regs;
3223       c->nr_payload_regs++;
3224       if (dispatch_width == 16) {
3225          /* R30: interpolated W if not SIMD8. */
3226          c->nr_payload_regs++;
3227       }
3228    }
3229
3230    c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3231    /* R31: MSAA position offsets. */
3232    if (c->prog_data.uses_pos_offset) {
3233       c->sample_pos_reg = c->nr_payload_regs;
3234       c->nr_payload_regs++;
3235    }
3236
3237    /* R32: MSAA input coverage mask */
3238    if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3239       assert(brw->gen >= 7);
3240       c->sample_mask_reg = c->nr_payload_regs;
3241       c->nr_payload_regs++;
3242       if (dispatch_width == 16) {
3243          /* R33: input coverage mask if not SIMD8. */
3244          c->nr_payload_regs++;
3245       }
3246    }
3247
3248    /* R34-: bary for 32-pixel. */
3249    /* R58-59: interp W for 32-pixel. */
3250
3251    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3252       c->source_depth_to_render_target = true;
3253    }
3254 }
3255
3256 void
3257 fs_visitor::assign_binding_table_offsets()
3258 {
3259    uint32_t next_binding_table_offset = 0;
3260
3261    /* If there are no color regions, we still perform an FB write to a null
3262     * renderbuffer, which we place at surface index 0.
3263     */
3264    c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3265    next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3266
3267    assign_common_binding_table_offsets(next_binding_table_offset);
3268 }
3269
3270 void
3271 fs_visitor::calculate_register_pressure()
3272 {
3273    calculate_live_intervals();
3274
3275    int num_instructions = 0;
3276    foreach_list(node, &this->instructions) {
3277       ++num_instructions;
3278    }
3279
3280    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3281
3282    for (int reg = 0; reg < virtual_grf_count; reg++) {
3283       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3284          regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3285    }
3286 }
3287
3288 bool
3289 fs_visitor::run()
3290 {
3291    sanity_param_count = fp->Base.Parameters->NumParameters;
3292    uint32_t orig_nr_params = c->prog_data.nr_params;
3293    bool allocated_without_spills;
3294
3295    assign_binding_table_offsets();
3296
3297    if (brw->gen >= 6)
3298       setup_payload_gen6();
3299    else
3300       setup_payload_gen4();
3301
3302    if (0) {
3303       emit_dummy_fs();
3304    } else {
3305       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3306          emit_shader_time_begin();
3307
3308       calculate_urb_setup();
3309       if (fp->Base.InputsRead > 0) {
3310          if (brw->gen < 6)
3311             emit_interpolation_setup_gen4();
3312          else
3313             emit_interpolation_setup_gen6();
3314       }
3315
3316       /* We handle discards by keeping track of the still-live pixels in f0.1.
3317        * Initialize it with the dispatched pixels.
3318        */
3319       if (fp->UsesKill || c->key.alpha_test_func) {
3320          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3321          discard_init->flag_subreg = 1;
3322       }
3323
3324       /* Generate FS IR for main().  (the visitor only descends into
3325        * functions called "main").
3326        */
3327       if (shader) {
3328          foreach_list(node, &*shader->base.ir) {
3329             ir_instruction *ir = (ir_instruction *)node;
3330             base_ir = ir;
3331             this->result = reg_undef;
3332             ir->accept(this);
3333          }
3334       } else {
3335          emit_fragment_program_code();
3336       }
3337       base_ir = NULL;
3338       if (failed)
3339          return false;
3340
3341       emit(FS_OPCODE_PLACEHOLDER_HALT);
3342
3343       if (c->key.alpha_test_func)
3344          emit_alpha_test();
3345
3346       emit_fb_writes();
3347
3348       split_virtual_grfs();
3349
3350       move_uniform_array_access_to_pull_constants();
3351       remove_dead_constants();
3352       setup_pull_constants();
3353
3354       bool progress;
3355       do {
3356          progress = false;
3357
3358          compact_virtual_grfs();
3359
3360          progress = remove_duplicate_mrf_writes() || progress;
3361
3362          progress = opt_algebraic() || progress;
3363          progress = opt_cse() || progress;
3364          progress = opt_copy_propagate() || progress;
3365          progress = opt_peephole_predicated_break() || progress;
3366          progress = dead_code_eliminate() || progress;
3367          progress = dead_code_eliminate_local() || progress;
3368          progress = opt_peephole_sel() || progress;
3369          progress = dead_control_flow_eliminate(this) || progress;
3370          progress = opt_saturate_propagation() || progress;
3371          progress = register_coalesce() || progress;
3372          progress = compute_to_mrf() || progress;
3373       } while (progress);
3374
3375       lower_uniform_pull_constant_loads();
3376
3377       assign_curb_setup();
3378       assign_urb_setup();
3379
3380       static enum instruction_scheduler_mode pre_modes[] = {
3381          SCHEDULE_PRE,
3382          SCHEDULE_PRE_NON_LIFO,
3383          SCHEDULE_PRE_LIFO,
3384       };
3385
3386       /* Try each scheduling heuristic to see if it can successfully register
3387        * allocate without spilling.  They should be ordered by decreasing
3388        * performance but increasing likelihood of allocating.
3389        */
3390       for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3391          schedule_instructions(pre_modes[i]);
3392
3393          if (0) {
3394             assign_regs_trivial();
3395             allocated_without_spills = true;
3396          } else {
3397             allocated_without_spills = assign_regs(false);
3398          }
3399          if (allocated_without_spills)
3400             break;
3401       }
3402
3403       if (!allocated_without_spills) {
3404          /* We assume that any spilling is worse than just dropping back to
3405           * SIMD8.  There's probably actually some intermediate point where
3406           * SIMD16 with a couple of spills is still better.
3407           */
3408          if (dispatch_width == 16) {
3409             fail("Failure to register allocate.  Reduce number of "
3410                  "live scalar values to avoid this.");
3411          }
3412
3413          /* Since we're out of heuristics, just go spill registers until we
3414           * get an allocation.
3415           */
3416          while (!assign_regs(true)) {
3417             if (failed)
3418                break;
3419          }
3420       }
3421    }
3422    assert(force_uncompressed_stack == 0);
3423
3424    /* This must come after all optimization and register allocation, since
3425     * it inserts dead code that happens to have side effects, and it does
3426     * so based on the actual physical registers in use.
3427     */
3428    insert_gen4_send_dependency_workarounds();
3429
3430    if (failed)
3431       return false;
3432
3433    if (!allocated_without_spills)
3434       schedule_instructions(SCHEDULE_POST);
3435
3436    if (dispatch_width == 8) {
3437       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3438    } else {
3439       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3440
3441       /* Make sure we didn't try to sneak in an extra uniform */
3442       assert(orig_nr_params == c->prog_data.nr_params);
3443       (void) orig_nr_params;
3444    }
3445
3446    /* If any state parameters were appended, then ParameterValues could have
3447     * been realloced, in which case the driver uniform storage set up by
3448     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3449     * sure that didn't happen.
3450     */
3451    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3452
3453    return !failed;
3454 }
3455
3456 const unsigned *
3457 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3458                struct gl_fragment_program *fp,
3459                struct gl_shader_program *prog,
3460                unsigned *final_assembly_size)
3461 {
3462    bool start_busy = false;
3463    float start_time = 0;
3464
3465    if (unlikely(brw->perf_debug)) {
3466       start_busy = (brw->batch.last_bo &&
3467                     drm_intel_bo_busy(brw->batch.last_bo));
3468       start_time = get_time();
3469    }
3470
3471    struct brw_shader *shader = NULL;
3472    if (prog)
3473       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3474
3475    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3476       if (prog) {
3477          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3478          _mesa_print_ir(shader->base.ir, NULL);
3479          printf("\n\n");
3480       } else {
3481          printf("ARB_fragment_program %d ir for native fragment shader\n",
3482                 fp->Base.Id);
3483          _mesa_print_program(&fp->Base);
3484       }
3485    }
3486
3487    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3488     */
3489    fs_visitor v(brw, c, prog, fp, 8);
3490    if (!v.run()) {
3491       if (prog) {
3492          prog->LinkStatus = false;
3493          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3494       }
3495
3496       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3497                     v.fail_msg);
3498
3499       return NULL;
3500    }
3501
3502    exec_list *simd16_instructions = NULL;
3503    fs_visitor v2(brw, c, prog, fp, 16);
3504    if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3505       if (c->prog_data.nr_pull_params == 0) {
3506          /* Try a SIMD16 compile */
3507          v2.import_uniforms(&v);
3508          if (!v2.run()) {
3509             perf_debug("SIMD16 shader failed to compile, falling back to "
3510                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3511          } else {
3512             simd16_instructions = &v2.instructions;
3513          }
3514       } else {
3515          perf_debug("Skipping SIMD16 due to pull parameters.\n");
3516       }
3517    }
3518
3519    const unsigned *assembly = NULL;
3520    if (brw->gen >= 8) {
3521       gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3522       assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3523                                      final_assembly_size);
3524    } else {
3525       fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3526       assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3527                                      final_assembly_size);
3528    }
3529
3530    if (unlikely(brw->perf_debug) && shader) {
3531       if (shader->compiled_once)
3532          brw_wm_debug_recompile(brw, prog, &c->key);
3533       shader->compiled_once = true;
3534
3535       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3536          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3537                     (get_time() - start_time) * 1000);
3538       }
3539    }
3540
3541    return assembly;
3542 }
3543
3544 bool
3545 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3546 {
3547    struct brw_context *brw = brw_context(ctx);
3548    struct brw_wm_prog_key key;
3549
3550    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3551       return true;
3552
3553    struct gl_fragment_program *fp = (struct gl_fragment_program *)
3554       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3555    struct brw_fragment_program *bfp = brw_fragment_program(fp);
3556    bool program_uses_dfdy = fp->UsesDFdy;
3557
3558    memset(&key, 0, sizeof(key));
3559
3560    if (brw->gen < 6) {
3561       if (fp->UsesKill)
3562          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3563
3564       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3565          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3566
3567       /* Just assume depth testing. */
3568       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3569       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3570    }
3571
3572    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3573                                          BRW_FS_VARYING_INPUT_MASK) > 16)
3574       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3575
3576    key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3577
3578    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3579    for (unsigned i = 0; i < sampler_count; i++) {
3580       if (fp->Base.ShadowSamplers & (1 << i)) {
3581          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3582          key.tex.swizzles[i] =
3583             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3584       } else {
3585          /* Color sampler: assume no swizzling. */
3586          key.tex.swizzles[i] = SWIZZLE_XYZW;
3587       }
3588    }
3589
3590    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3591       key.drawable_height = ctx->DrawBuffer->Height;
3592    }
3593
3594    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3595          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3596          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3597
3598    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3599       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3600                           key.nr_color_regions > 1;
3601    }
3602
3603    /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE.  The
3604     * quality of the derivatives is likely to be determined by the driconf
3605     * option.
3606     */
3607    key.high_quality_derivatives = brw->disable_derivative_optimization;
3608
3609    key.program_string_id = bfp->id;
3610
3611    uint32_t old_prog_offset = brw->wm.base.prog_offset;
3612    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3613
3614    bool success = do_wm_prog(brw, prog, bfp, &key);
3615
3616    brw->wm.base.prog_offset = old_prog_offset;
3617    brw->wm.prog_data = old_prog_data;
3618
3619    return success;
3620 }