i965/fs: Use a single instance of the pull_constant_loc[] array.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 this->opcode = BRW_OPCODE_NOP;
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
248 const fs_reg &surf_index,
249 const fs_reg &varying_offset,
250 uint32_t const_offset)
251 {
252 exec_list instructions;
253 fs_inst *inst;
254
255 /* We have our constant surface use a pitch of 4 bytes, so our index can
256 * be any component of a vector, and then we load 4 contiguous
257 * components starting from that.
258 *
259 * We break down the const_offset to a portion added to the variable
260 * offset and a portion done using reg_offset, which means that if you
261 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
262 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
263 * CSE can later notice that those loads are all the same and eliminate
264 * the redundant ones.
265 */
266 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
267 instructions.push_tail(ADD(vec4_offset,
268 varying_offset, const_offset & ~3));
269
270 int scale = 1;
271 if (brw->gen == 4 && dispatch_width == 8) {
272 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
273 * u, v, r) as parameters, or we can just use the SIMD16 message
274 * consisting of (header, u). We choose the second, at the cost of a
275 * longer return length.
276 */
277 scale = 2;
278 }
279
280 enum opcode op;
281 if (brw->gen >= 7)
282 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
283 else
284 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
285 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
286 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
287 inst->regs_written = 4 * scale;
288 instructions.push_tail(inst);
289
290 if (brw->gen < 7) {
291 inst->base_mrf = 13;
292 inst->header_present = true;
293 if (brw->gen == 4)
294 inst->mlen = 3;
295 else
296 inst->mlen = 1 + dispatch_width / 8;
297 }
298
299 vec4_result.reg_offset += (const_offset & 3) * scale;
300 instructions.push_tail(MOV(dst, vec4_result));
301
302 return instructions;
303 }
304
305 /**
306 * A helper for MOV generation for fixing up broken hardware SEND dependency
307 * handling.
308 */
309 fs_inst *
310 fs_visitor::DEP_RESOLVE_MOV(int grf)
311 {
312 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
313
314 inst->ir = NULL;
315 inst->annotation = "send dependency resolve";
316
317 /* The caller always wants uncompressed to emit the minimal extra
318 * dependencies, and to avoid having to deal with aligning its regs to 2.
319 */
320 inst->force_uncompressed = true;
321
322 return inst;
323 }
324
325 bool
326 fs_inst::equals(fs_inst *inst)
327 {
328 return (opcode == inst->opcode &&
329 dst.equals(inst->dst) &&
330 src[0].equals(inst->src[0]) &&
331 src[1].equals(inst->src[1]) &&
332 src[2].equals(inst->src[2]) &&
333 saturate == inst->saturate &&
334 predicate == inst->predicate &&
335 conditional_mod == inst->conditional_mod &&
336 mlen == inst->mlen &&
337 base_mrf == inst->base_mrf &&
338 sampler == inst->sampler &&
339 target == inst->target &&
340 eot == inst->eot &&
341 header_present == inst->header_present &&
342 shadow_compare == inst->shadow_compare &&
343 offset == inst->offset);
344 }
345
346 bool
347 fs_inst::overwrites_reg(const fs_reg &reg)
348 {
349 return (reg.file == dst.file &&
350 reg.reg == dst.reg &&
351 reg.reg_offset >= dst.reg_offset &&
352 reg.reg_offset < dst.reg_offset + regs_written);
353 }
354
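/**
 * Returns true for instructions whose SEND message payload is sourced
 * directly from the GRF instead of being copied through MRFs first.
 */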
355 bool
356 fs_inst::is_send_from_grf()
357 {
358 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
359 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
360 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
361 src[1].file == GRF) ||
362 (is_tex() && src[0].file == GRF));
363 }
364
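/**
 * Returns true if negate/abs source modifiers can be applied to the sources
 * of \p inst: Gen6 math, sends from the GRF, and a few opcode-specific
 * cases cannot take them.
 */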
365 bool
366 fs_visitor::can_do_source_mods(fs_inst *inst)
367 {
368 if (brw->gen == 6 && inst->is_math())
369 return false;
370
371 if (inst->is_send_from_grf())
372 return false;
373
374 if (!inst->can_do_source_mods())
375 return false;
376
377 return true;
378 }
379
380 void
381 fs_reg::init()
382 {
383 memset(this, 0, sizeof(*this));
384 stride = 1;
385 }
386
387 /** Generic unset register constructor. */
388 fs_reg::fs_reg()
389 {
390 init();
391 this->file = BAD_FILE;
392 }
393
394 /** Immediate value constructor. */
395 fs_reg::fs_reg(float f)
396 {
397 init();
398 this->file = IMM;
399 this->type = BRW_REGISTER_TYPE_F;
400 this->imm.f = f;
401 }
402
403 /** Immediate value constructor. */
404 fs_reg::fs_reg(int32_t i)
405 {
406 init();
407 this->file = IMM;
408 this->type = BRW_REGISTER_TYPE_D;
409 this->imm.i = i;
410 }
411
412 /** Immediate value constructor. */
413 fs_reg::fs_reg(uint32_t u)
414 {
415 init();
416 this->file = IMM;
417 this->type = BRW_REGISTER_TYPE_UD;
418 this->imm.u = u;
419 }
420
421 /** Fixed brw_reg. */
422 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
423 {
424 init();
425 this->file = HW_REG;
426 this->fixed_hw_reg = fixed_hw_reg;
427 this->type = fixed_hw_reg.type;
428 }
429
430 bool
431 fs_reg::equals(const fs_reg &r) const
432 {
433 return (file == r.file &&
434 reg == r.reg &&
435 reg_offset == r.reg_offset &&
436 subreg_offset == r.subreg_offset &&
437 type == r.type &&
438 negate == r.negate &&
439 abs == r.abs &&
440 !reladdr && !r.reladdr &&
441 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
442 sizeof(fixed_hw_reg)) == 0 &&
443 stride == r.stride &&
444 imm.u == r.imm.u);
445 }
446
447 fs_reg &
448 fs_reg::apply_stride(unsigned stride)
449 {
450 assert((this->stride * stride) <= 4 &&
451 (is_power_of_two(stride) || stride == 0) &&
452 file != HW_REG && file != IMM);
453 this->stride *= stride;
454 return *this;
455 }
456
457 fs_reg &
458 fs_reg::set_smear(unsigned subreg)
459 {
460 assert(file != HW_REG && file != IMM);
461 subreg_offset = subreg * type_sz(type);
462 stride = 0;
463 return *this;
464 }
465
466 bool
467 fs_reg::is_contiguous() const
468 {
469 return stride == 1;
470 }
471
472 bool
473 fs_reg::is_zero() const
474 {
475 if (file != IMM)
476 return false;
477
478 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
479 }
480
481 bool
482 fs_reg::is_one() const
483 {
484 if (file != IMM)
485 return false;
486
487 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
488 }
489
490 bool
491 fs_reg::is_null() const
492 {
493 return file == HW_REG &&
494 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
495 fixed_hw_reg.nr == BRW_ARF_NULL;
496 }
497
498 bool
499 fs_reg::is_valid_3src() const
500 {
501 return file == GRF || file == UNIFORM;
502 }
503
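/**
 * Returns the number of scalar slots a GLSL type occupies in this backend:
 * one per component, with samplers and atomic counters taking no space.
 */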
504 int
505 fs_visitor::type_size(const struct glsl_type *type)
506 {
507 unsigned int size, i;
508
509 switch (type->base_type) {
510 case GLSL_TYPE_UINT:
511 case GLSL_TYPE_INT:
512 case GLSL_TYPE_FLOAT:
513 case GLSL_TYPE_BOOL:
514 return type->components();
515 case GLSL_TYPE_ARRAY:
516 return type_size(type->fields.array) * type->length;
517 case GLSL_TYPE_STRUCT:
518 size = 0;
519 for (i = 0; i < type->length; i++) {
520 size += type_size(type->fields.structure[i].type);
521 }
522 return size;
523 case GLSL_TYPE_SAMPLER:
524 /* Samplers take up no register space, since they're baked in at
525 * link time.
526 */
527 return 0;
528 case GLSL_TYPE_ATOMIC_UINT:
529 return 0;
530 case GLSL_TYPE_IMAGE:
531 case GLSL_TYPE_VOID:
532 case GLSL_TYPE_ERROR:
533 case GLSL_TYPE_INTERFACE:
534 assert(!"not reached");
535 break;
536 }
537
538 return 0;
539 }
540
541 fs_reg
542 fs_visitor::get_timestamp()
543 {
544 assert(brw->gen >= 7);
545
546 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
547 BRW_ARF_TIMESTAMP,
548 0),
549 BRW_REGISTER_TYPE_UD));
550
551 fs_reg dst = fs_reg(this, glsl_type::uint_type);
552
553 fs_inst *mov = emit(MOV(dst, ts));
554 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
555 * even if it's not enabled in the dispatch.
556 */
557 mov->force_writemask_all = true;
558 mov->force_uncompressed = true;
559
560 /* The caller wants the low 32 bits of the timestamp. Since it's running
561 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
562 * which is plenty of time for our purposes. It is identical across the
563 * EUs, but since it's tracking GPU core speed it will increment at a
564 * varying rate as render P-states change.
565 *
566 * The caller could also check if render P-states have changed (or anything
567 * else that might disrupt timing) by setting smear to 2 and checking if
568 * that field is != 0.
569 */
570 dst.set_smear(0);
571
572 return dst;
573 }
574
575 void
576 fs_visitor::emit_shader_time_begin()
577 {
578 current_annotation = "shader time start";
579 shader_start_time = get_timestamp();
580 }
581
582 void
583 fs_visitor::emit_shader_time_end()
584 {
585 current_annotation = "shader time end";
586
587 enum shader_time_shader_type type, written_type, reset_type;
588 if (dispatch_width == 8) {
589 type = ST_FS8;
590 written_type = ST_FS8_WRITTEN;
591 reset_type = ST_FS8_RESET;
592 } else {
593 assert(dispatch_width == 16);
594 type = ST_FS16;
595 written_type = ST_FS16_WRITTEN;
596 reset_type = ST_FS16_RESET;
597 }
598
599 fs_reg shader_end_time = get_timestamp();
600
601 /* Check that there weren't any timestamp reset events (assuming these
602 * were the only two timestamp reads that happened).
603 */
604 fs_reg reset = shader_end_time;
605 reset.set_smear(2);
606 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
607 test->conditional_mod = BRW_CONDITIONAL_Z;
608 emit(IF(BRW_PREDICATE_NORMAL));
609
610 push_force_uncompressed();
611 fs_reg start = shader_start_time;
612 start.negate = true;
613 fs_reg diff = fs_reg(this, glsl_type::uint_type);
614 emit(ADD(diff, start, shader_end_time));
615
616 /* If there were no instructions between the two timestamp gets, the diff
617 * is 2 cycles. Remove that overhead, so I can forget about that when
618 * trying to determine the time taken for single instructions.
619 */
620 emit(ADD(diff, diff, fs_reg(-2u)));
621
622 emit_shader_time_write(type, diff);
623 emit_shader_time_write(written_type, fs_reg(1u));
624 emit(BRW_OPCODE_ELSE);
625 emit_shader_time_write(reset_type, fs_reg(1u));
626 emit(BRW_OPCODE_ENDIF);
627
628 pop_force_uncompressed();
629 }
630
631 void
632 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
633 fs_reg value)
634 {
635 int shader_time_index =
636 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
637 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
638
639 fs_reg payload;
640 if (dispatch_width == 8)
641 payload = fs_reg(this, glsl_type::uvec2_type);
642 else
643 payload = fs_reg(this, glsl_type::uint_type);
644
645 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
646 fs_reg(), payload, offset, value));
647 }
648
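/**
 * Marks the compile as failed and records a printf-style message describing
 * why; the message is also printed when the DEBUG_WM flag is set.
 */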
649 void
650 fs_visitor::fail(const char *format, ...)
651 {
652 va_list va;
653 char *msg;
654
655 if (failed)
656 return;
657
658 failed = true;
659
660 va_start(va, format);
661 msg = ralloc_vasprintf(mem_ctx, format, va);
662 va_end(va);
663 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
664
665 this->fail_msg = msg;
666
667 if (INTEL_DEBUG & DEBUG_WM) {
668 fprintf(stderr, "%s", msg);
669 }
670 }
671
672 fs_inst *
673 fs_visitor::emit(enum opcode opcode)
674 {
675 return emit(new(mem_ctx) fs_inst(opcode));
676 }
677
678 fs_inst *
679 fs_visitor::emit(enum opcode opcode, fs_reg dst)
680 {
681 return emit(new(mem_ctx) fs_inst(opcode, dst));
682 }
683
684 fs_inst *
685 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
686 {
687 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
688 }
689
690 fs_inst *
691 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
692 {
693 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
694 }
695
696 fs_inst *
697 fs_visitor::emit(enum opcode opcode, fs_reg dst,
698 fs_reg src0, fs_reg src1, fs_reg src2)
699 {
700 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
701 }
702
703 void
704 fs_visitor::push_force_uncompressed()
705 {
706 force_uncompressed_stack++;
707 }
708
709 void
710 fs_visitor::pop_force_uncompressed()
711 {
712 force_uncompressed_stack--;
713 assert(force_uncompressed_stack >= 0);
714 }
715
716 /**
717 * Returns true if the instruction has a flag that means it won't
718 * update an entire destination register.
719 *
720 * For example, dead code elimination and live variable analysis want to know
721 * when a write to a variable screens off any preceding values that were in
722 * it.
723 */
724 bool
725 fs_inst::is_partial_write()
726 {
727 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
728 this->force_uncompressed ||
729 this->force_sechalf || !this->dst.is_contiguous());
730 }
731
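/**
 * Returns how many virtual GRF registers are read through source \p arg.
 * Texture messages sourcing their payload from the GRF read the whole
 * payload through src[0]; everything else reads a single register.
 */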
732 int
733 fs_inst::regs_read(fs_visitor *v, int arg)
734 {
735 if (is_tex() && arg == 0 && src[0].file == GRF) {
736 if (v->dispatch_width == 16)
737 return (mlen + 1) / 2;
738 else
739 return mlen;
740 }
741 return 1;
742 }
743
744 bool
745 fs_inst::reads_flag()
746 {
747 return predicate;
748 }
749
750 bool
751 fs_inst::writes_flag()
752 {
753 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
754 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
755 }
756
757 /**
758 * Returns how many MRFs an FS opcode will write over.
759 *
760 * Note that this is not the 0 or 1 implied writes in an actual gen
761 * instruction -- the FS opcodes often generate MOVs in addition.
762 */
763 int
764 fs_visitor::implied_mrf_writes(fs_inst *inst)
765 {
766 if (inst->mlen == 0)
767 return 0;
768
769 if (inst->base_mrf == -1)
770 return 0;
771
772 switch (inst->opcode) {
773 case SHADER_OPCODE_RCP:
774 case SHADER_OPCODE_RSQ:
775 case SHADER_OPCODE_SQRT:
776 case SHADER_OPCODE_EXP2:
777 case SHADER_OPCODE_LOG2:
778 case SHADER_OPCODE_SIN:
779 case SHADER_OPCODE_COS:
780 return 1 * dispatch_width / 8;
781 case SHADER_OPCODE_POW:
782 case SHADER_OPCODE_INT_QUOTIENT:
783 case SHADER_OPCODE_INT_REMAINDER:
784 return 2 * dispatch_width / 8;
785 case SHADER_OPCODE_TEX:
786 case FS_OPCODE_TXB:
787 case SHADER_OPCODE_TXD:
788 case SHADER_OPCODE_TXF:
789 case SHADER_OPCODE_TXF_CMS:
790 case SHADER_OPCODE_TXF_MCS:
791 case SHADER_OPCODE_TG4:
792 case SHADER_OPCODE_TG4_OFFSET:
793 case SHADER_OPCODE_TXL:
794 case SHADER_OPCODE_TXS:
795 case SHADER_OPCODE_LOD:
796 return 1;
797 case FS_OPCODE_FB_WRITE:
798 return 2;
799 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
800 case SHADER_OPCODE_GEN4_SCRATCH_READ:
801 return 1;
802 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
803 return inst->mlen;
804 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
805 return 2;
806 case SHADER_OPCODE_UNTYPED_ATOMIC:
807 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
808 return 0;
809 default:
810 assert(!"not reached");
811 return inst->mlen;
812 }
813 }
814
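/**
 * Allocates a new virtual GRF of \p size registers, growing the
 * virtual_grf_sizes[] array as needed, and returns its index.
 */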
815 int
816 fs_visitor::virtual_grf_alloc(int size)
817 {
818 if (virtual_grf_array_size <= virtual_grf_count) {
819 if (virtual_grf_array_size == 0)
820 virtual_grf_array_size = 16;
821 else
822 virtual_grf_array_size *= 2;
823 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
824 virtual_grf_array_size);
825 }
826 virtual_grf_sizes[virtual_grf_count] = size;
827 return virtual_grf_count++;
828 }
829
830 /** Register file and register number constructor (defaults to float type). */
831 fs_reg::fs_reg(enum register_file file, int reg)
832 {
833 init();
834 this->file = file;
835 this->reg = reg;
836 this->type = BRW_REGISTER_TYPE_F;
837 }
838
839 /** Register file, register number and type constructor. */
840 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
841 {
842 init();
843 this->file = file;
844 this->reg = reg;
845 this->type = type;
846 }
847
848 /** Automatic reg constructor. */
849 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
850 {
851 init();
852
853 this->file = GRF;
854 this->reg = v->virtual_grf_alloc(v->type_size(type));
855 this->reg_offset = 0;
856 this->type = brw_type_for_base_type(type);
857 }
858
859 fs_reg *
860 fs_visitor::variable_storage(ir_variable *var)
861 {
862 return (fs_reg *)hash_table_find(this->variable_ht, var);
863 }
864
865 void
866 import_uniforms_callback(const void *key,
867 void *data,
868 void *closure)
869 {
870 struct hash_table *dst_ht = (struct hash_table *)closure;
871 const fs_reg *reg = (const fs_reg *)data;
872
873 if (reg->file != UNIFORM)
874 return;
875
876 hash_table_insert(dst_ht, data, key);
877 }
878
879 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
880 * This brings in those uniform definitions.
881 */
882 void
883 fs_visitor::import_uniforms(fs_visitor *v)
884 {
885 hash_table_call_foreach(v->variable_ht,
886 import_uniforms_callback,
887 variable_ht);
888 this->push_constant_loc = v->push_constant_loc;
889 this->uniforms = v->uniforms;
890 }
891
892 /* Our support for uniforms is piggy-backed on the struct
893 * gl_fragment_program, because that's where the values actually
894 * get stored, rather than in some global gl_shader_program uniform
895 * store.
896 */
897 void
898 fs_visitor::setup_uniform_values(ir_variable *ir)
899 {
900 int namelen = strlen(ir->name);
901
902 /* The data for our (non-builtin) uniforms is stored in a series of
903 * gl_uniform_driver_storage structs for each subcomponent that
904 * glGetUniformLocation() could name. We know it's been set up in the same
905 * order we'd walk the type, so walk the list of storage and find anything
906 * with our name, or the prefix of a component that starts with our name.
907 */
908 unsigned params_before = uniforms;
909 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
910 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
911
912 if (strncmp(ir->name, storage->name, namelen) != 0 ||
913 (storage->name[namelen] != 0 &&
914 storage->name[namelen] != '.' &&
915 storage->name[namelen] != '[')) {
916 continue;
917 }
918
919 unsigned slots = storage->type->component_slots();
920 if (storage->array_elements)
921 slots *= storage->array_elements;
922
923 for (unsigned i = 0; i < slots; i++) {
924 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
925 }
926 }
927
928 /* Make sure we actually initialized the right amount of stuff here. */
929 assert(params_before + ir->type->component_slots() == uniforms);
930 (void)params_before;
931 }
932
933
934 /* Our support for builtin uniforms is even scarier than non-builtin.
935 * It sits on top of the PROG_STATE_VAR parameters that are
936 * automatically updated from GL context state.
937 */
938 void
939 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
940 {
941 const ir_state_slot *const slots = ir->state_slots;
942 assert(ir->state_slots != NULL);
943
944 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
945 /* This state reference has already been setup by ir_to_mesa, but we'll
946 * get the same index back here.
947 */
948 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
949 (gl_state_index *)slots[i].tokens);
950
951 /* Add each of the unique swizzles of the element as a parameter.
952 * This'll end up matching the expected layout of the
953 * array/matrix/structure we're trying to fill in.
954 */
955 int last_swiz = -1;
956 for (unsigned int j = 0; j < 4; j++) {
957 int swiz = GET_SWZ(slots[i].swizzle, j);
958 if (swiz == last_swiz)
959 break;
960 last_swiz = swiz;
961
962 stage_prog_data->param[uniforms++] =
963 &fp->Base.Parameters->ParameterValues[index][swiz].f;
964 }
965 }
966 }
967
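/**
 * Sets up gl_FragCoord, honoring the shader's pixel-center and origin
 * conventions and the render-to-FBO state for the Y coordinate.
 */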
968 fs_reg *
969 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
970 {
971 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
972 fs_reg wpos = *reg;
973 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
974
975 /* gl_FragCoord.x */
976 if (ir->data.pixel_center_integer) {
977 emit(MOV(wpos, this->pixel_x));
978 } else {
979 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
980 }
981 wpos.reg_offset++;
982
983 /* gl_FragCoord.y */
984 if (!flip && ir->data.pixel_center_integer) {
985 emit(MOV(wpos, this->pixel_y));
986 } else {
987 fs_reg pixel_y = this->pixel_y;
988 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
989
990 if (flip) {
991 pixel_y.negate = true;
992 offset += c->key.drawable_height - 1.0;
993 }
994
995 emit(ADD(wpos, pixel_y, fs_reg(offset)));
996 }
997 wpos.reg_offset++;
998
999 /* gl_FragCoord.z */
1000 if (brw->gen >= 6) {
1001 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1002 } else {
1003 emit(FS_OPCODE_LINTERP, wpos,
1004 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1005 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1006 interp_reg(VARYING_SLOT_POS, 2));
1007 }
1008 wpos.reg_offset++;
1009
1010 /* gl_FragCoord.w: Already set up in emit_interpolation */
1011 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1012
1013 return reg;
1014 }
1015
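/**
 * Emits a LINTERP for \p attr using the barycentric coordinates that match
 * the interpolation qualifier and the centroid/sample request.
 */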
1016 fs_inst *
1017 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1018 glsl_interp_qualifier interpolation_mode,
1019 bool is_centroid, bool is_sample)
1020 {
1021 brw_wm_barycentric_interp_mode barycoord_mode;
1022 if (brw->gen >= 6) {
1023 if (is_centroid) {
1024 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1025 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1026 else
1027 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1028 } else if (is_sample) {
1029 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1030 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1031 else
1032 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1033 } else {
1034 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1035 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1036 else
1037 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1038 }
1039 } else {
1040 /* On Ironlake and below, there is only one interpolation mode.
1041 * Centroid interpolation doesn't mean anything on this hardware --
1042 * there is no multisampling.
1043 */
1044 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1045 }
1046 return emit(FS_OPCODE_LINTERP, attr,
1047 this->delta_x[barycoord_mode],
1048 this->delta_y[barycoord_mode], interp);
1049 }
1050
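/**
 * Emits interpolation (or flat-shaded constant moves) for a generic varying
 * input, walking arrays and matrix columns one vector at a time.
 */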
1051 fs_reg *
1052 fs_visitor::emit_general_interpolation(ir_variable *ir)
1053 {
1054 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1055 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1056 fs_reg attr = *reg;
1057
1058 unsigned int array_elements;
1059 const glsl_type *type;
1060
1061 if (ir->type->is_array()) {
1062 array_elements = ir->type->length;
1063 if (array_elements == 0) {
1064 fail("dereferenced array '%s' has length 0\n", ir->name);
1065 }
1066 type = ir->type->fields.array;
1067 } else {
1068 array_elements = 1;
1069 type = ir->type;
1070 }
1071
1072 glsl_interp_qualifier interpolation_mode =
1073 ir->determine_interpolation_mode(c->key.flat_shade);
1074
1075 int location = ir->data.location;
1076 for (unsigned int i = 0; i < array_elements; i++) {
1077 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1078 if (c->prog_data.urb_setup[location] == -1) {
1079 /* If there's no incoming setup data for this slot, don't
1080 * emit interpolation for it.
1081 */
1082 attr.reg_offset += type->vector_elements;
1083 location++;
1084 continue;
1085 }
1086
1087 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1088 /* Constant interpolation (flat shading) case. The SF has
1089 * handed us defined values in only the constant offset
1090 * field of the setup reg.
1091 */
1092 for (unsigned int k = 0; k < type->vector_elements; k++) {
1093 struct brw_reg interp = interp_reg(location, k);
1094 interp = suboffset(interp, 3);
1095 interp.type = reg->type;
1096 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1097 attr.reg_offset++;
1098 }
1099 } else {
1100 /* Smooth/noperspective interpolation case. */
1101 for (unsigned int k = 0; k < type->vector_elements; k++) {
1102 struct brw_reg interp = interp_reg(location, k);
1103 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1104 ir->data.centroid && !c->key.persample_shading,
1105 ir->data.sample || c->key.persample_shading);
1106 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1107 /* Get the pixel/sample mask into f0 so that we know
1108 * which pixels are lit. Then, for each channel that is
1109 * unlit, replace the centroid data with non-centroid
1110 * data.
1111 */
1112 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1113 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1114 interpolation_mode,
1115 false, false);
1116 inst->predicate = BRW_PREDICATE_NORMAL;
1117 inst->predicate_inverse = true;
1118 }
1119 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1120 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1121 }
1122 attr.reg_offset++;
1123 }
1124
1125 }
1126 location++;
1127 }
1128 }
1129
1130 return reg;
1131 }
1132
1133 fs_reg *
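/**
 * Computes gl_FrontFacing from the "back face" bit delivered in the thread
 * payload (g0 on Gen6+, g1.6 on earlier hardware).
 */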
1134 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1135 {
1136 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1137
1138 /* The frontfacing comes in as a bit in the thread payload. */
1139 if (brw->gen >= 6) {
1140 emit(BRW_OPCODE_ASR, *reg,
1141 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1142 fs_reg(15));
1143 emit(BRW_OPCODE_NOT, *reg, *reg);
1144 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1145 } else {
1146 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1147 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1148 * us the front face.
1149 */
1150 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1151 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1152 }
1153
1154 return reg;
1155 }
1156
1157 void
1158 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1159 {
1160 assert(dst.type == BRW_REGISTER_TYPE_F);
1161
1162 if (c->key.compute_pos_offset) {
1163 /* Convert int_sample_pos to floating point */
1164 emit(MOV(dst, int_sample_pos));
1165 /* Scale to the range [0, 1] */
1166 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1167 }
1168 else {
1169 /* From ARB_sample_shading specification:
1170 * "When rendering to a non-multisample buffer, or if multisample
1171 * rasterization is disabled, gl_SamplePosition will always be
1172 * (0.5, 0.5)."
1173 */
1174 emit(MOV(dst, fs_reg(0.5f)));
1175 }
1176 }
1177
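/**
 * Builds gl_SamplePosition from the per-slot X/Y sample offsets delivered
 * as bytes in the thread payload.
 */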
1178 fs_reg *
1179 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1180 {
1181 assert(brw->gen >= 6);
1182 assert(ir->type == glsl_type::vec2_type);
1183
1184 this->current_annotation = "compute sample position";
1185 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1186 fs_reg pos = *reg;
1187 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1188 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1189
1190 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1191 * mode will be enabled.
1192 *
1193 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1194 * R31.1:0 Position Offset X/Y for Slot[3:0]
1195 * R31.3:2 Position Offset X/Y for Slot[7:4]
1196 * .....
1197 *
1198 * The X, Y sample positions come in as bytes in thread payload. So, read
1199 * the positions using vstride=16, width=8, hstride=2.
1200 */
1201 struct brw_reg sample_pos_reg =
1202 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1203 BRW_REGISTER_TYPE_B), 16, 8, 2);
1204
1205 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1206 if (dispatch_width == 16) {
1207 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1208 fs_reg(suboffset(sample_pos_reg, 16))));
1209 inst->force_sechalf = true;
1210 }
1211 /* Compute gl_SamplePosition.x */
1212 compute_sample_position(pos, int_sample_x);
1213 pos.reg_offset++;
1214 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1215 if (dispatch_width == 16) {
1216 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1217 fs_reg(suboffset(sample_pos_reg, 17))));
1218 inst->force_sechalf = true;
1219 }
1220 /* Compute gl_SamplePosition.y */
1221 compute_sample_position(pos, int_sample_y);
1222 return reg;
1223 }
1224
1225 fs_reg *
1226 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1227 {
1228 assert(brw->gen >= 6);
1229
1230 this->current_annotation = "compute sample id";
1231 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1232
1233 if (c->key.compute_sample_id) {
1234 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1235 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1236 t2.type = BRW_REGISTER_TYPE_UW;
1237
1238 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1239 * 8x multisampling, subspan 0 will represent sample N (where N
1240 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1241 * 7. We can find the value of N by looking at R0.0 bits 7:6
1242 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1243 * (since samples are always delivered in pairs). That is, we
1244 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1245 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1246 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1247 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1248 * populating a temporary variable with the sequence (0, 1, 2, 3),
1249 * and then reading from it using vstride=1, width=4, hstride=0.
1250 * These computations hold good for 4x multisampling as well.
1251 */
1252 emit(BRW_OPCODE_AND, t1,
1253 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1254 fs_reg(brw_imm_d(0xc0)));
1255 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1256 /* This works for both SIMD8 and SIMD16 */
1257 emit(MOV(t2, brw_imm_v(0x3210)));
1258 /* This special instruction takes care of setting vstride=1,
1259 * width=4, hstride=0 of t2 during an ADD instruction.
1260 */
1261 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1262 } else {
1263 /* As per GL_ARB_sample_shading specification:
1264 * "When rendering to a non-multisample buffer, or if multisample
1265 * rasterization is disabled, gl_SampleID will always be zero."
1266 */
1267 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1268 }
1269
1270 return reg;
1271 }
1272
1273 fs_reg *
1274 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1275 {
1276 assert(brw->gen >= 7);
1277 this->current_annotation = "compute gl_SampleMaskIn";
1278 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1279 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1280 return reg;
1281 }
1282
1283 fs_reg
1284 fs_visitor::fix_math_operand(fs_reg src)
1285 {
1286 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1287 * might be able to do better by doing execsize = 1 math and then
1288 * expanding that result out, but we would need to be careful with
1289 * masking.
1290 *
1291 * The hardware ignores source modifiers (negate and abs) on math
1292 * instructions, so we also move to a temp to set those up.
1293 */
1294 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1295 !src.abs && !src.negate)
1296 return src;
1297
1298 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1299 * operands to math
1300 */
1301 if (brw->gen >= 7 && src.file != IMM)
1302 return src;
1303
1304 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1305 expanded.type = src.type;
1306 emit(BRW_OPCODE_MOV, expanded, src);
1307 return expanded;
1308 }
1309
1310 fs_inst *
1311 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1312 {
1313 switch (opcode) {
1314 case SHADER_OPCODE_RCP:
1315 case SHADER_OPCODE_RSQ:
1316 case SHADER_OPCODE_SQRT:
1317 case SHADER_OPCODE_EXP2:
1318 case SHADER_OPCODE_LOG2:
1319 case SHADER_OPCODE_SIN:
1320 case SHADER_OPCODE_COS:
1321 break;
1322 default:
1323 assert(!"not reached: bad math opcode");
1324 return NULL;
1325 }
1326
1327 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1328 * might be able to do better by doing execsize = 1 math and then
1329 * expanding that result out, but we would need to be careful with
1330 * masking.
1331 *
1332 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1333 * instructions, so we also move to a temp to set those up.
1334 */
1335 if (brw->gen >= 6)
1336 src = fix_math_operand(src);
1337
1338 fs_inst *inst = emit(opcode, dst, src);
1339
1340 if (brw->gen < 6) {
1341 inst->base_mrf = 2;
1342 inst->mlen = dispatch_width / 8;
1343 }
1344
1345 return inst;
1346 }
1347
1348 fs_inst *
1349 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1350 {
1351 int base_mrf = 2;
1352 fs_inst *inst;
1353
1354 switch (opcode) {
1355 case SHADER_OPCODE_INT_QUOTIENT:
1356 case SHADER_OPCODE_INT_REMAINDER:
1357 if (brw->gen >= 7 && dispatch_width == 16)
1358 fail("SIMD16 INTDIV unsupported\n");
1359 break;
1360 case SHADER_OPCODE_POW:
1361 break;
1362 default:
1363 assert(!"not reached: unsupported binary math opcode.");
1364 return NULL;
1365 }
1366
1367 if (brw->gen >= 6) {
1368 src0 = fix_math_operand(src0);
1369 src1 = fix_math_operand(src1);
1370
1371 inst = emit(opcode, dst, src0, src1);
1372 } else {
1373 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1374 * "Message Payload":
1375 *
1376 * "Operand0[7]. For the INT DIV functions, this operand is the
1377 * denominator."
1378 * ...
1379 * "Operand1[7]. For the INT DIV functions, this operand is the
1380 * numerator."
1381 */
1382 bool is_int_div = opcode != SHADER_OPCODE_POW;
1383 fs_reg &op0 = is_int_div ? src1 : src0;
1384 fs_reg &op1 = is_int_div ? src0 : src1;
1385
1386 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1387 inst = emit(opcode, dst, op0, reg_null_f);
1388
1389 inst->base_mrf = base_mrf;
1390 inst->mlen = 2 * dispatch_width / 8;
1391 }
1392 return inst;
1393 }
1394
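/**
 * Maps UNIFORM file registers to the fixed CURBE (push constant) GRFs that
 * follow the thread payload, using the locations chosen in
 * assign_constant_locations().
 */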
1395 void
1396 fs_visitor::assign_curb_setup()
1397 {
1398 if (dispatch_width == 8) {
1399 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1400 } else {
1401 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1402 }
1403
1404 c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1405
1406 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1407 foreach_list(node, &this->instructions) {
1408 fs_inst *inst = (fs_inst *)node;
1409
1410 for (unsigned int i = 0; i < 3; i++) {
1411 if (inst->src[i].file == UNIFORM) {
1412 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1413 int constant_nr;
1414 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1415 constant_nr = push_constant_loc[uniform_nr];
1416 } else {
1417 /* Section 5.11 of the OpenGL 4.1 spec says:
1418 * "Out-of-bounds reads return undefined values, which include
1419 * values from other variables of the active program or zero."
1420 * Just return the first push constant.
1421 */
1422 constant_nr = 0;
1423 }
1424
1425 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1426 constant_nr / 8,
1427 constant_nr % 8);
1428
1429 inst->src[i].file = HW_REG;
1430 inst->src[i].fixed_hw_reg = byte_offset(
1431 retype(brw_reg, inst->src[i].type),
1432 inst->src[i].subreg_offset);
1433 }
1434 }
1435 }
1436 }
1437
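/**
 * Decides which URB slot each incoming varying lands in, filling
 * c->prog_data.urb_setup[] and counting the varyings the shader actually
 * uses.
 */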
1438 void
1439 fs_visitor::calculate_urb_setup()
1440 {
1441 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1442 c->prog_data.urb_setup[i] = -1;
1443 }
1444
1445 int urb_next = 0;
1446 /* Figure out where each of the incoming setup attributes lands. */
1447 if (brw->gen >= 6) {
1448 if (_mesa_bitcount_64(fp->Base.InputsRead &
1449 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1450 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1451 * first 16 varying inputs, so we can put them wherever we want.
1452 * Just put them in order.
1453 *
1454 * This is useful because it means that (a) inputs not used by the
1455 * fragment shader won't take up valuable register space, and (b) we
1456 * won't have to recompile the fragment shader if it gets paired with
1457 * a different vertex (or geometry) shader.
1458 */
1459 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1460 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1461 BITFIELD64_BIT(i)) {
1462 c->prog_data.urb_setup[i] = urb_next++;
1463 }
1464 }
1465 } else {
1466 /* We have enough input varyings that the SF/SBE pipeline stage can't
1467 * arbitrarily rearrange them to suit our whim; we have to put them
1468 * in an order that matches the output of the previous pipeline stage
1469 * (geometry or vertex shader).
1470 */
1471 struct brw_vue_map prev_stage_vue_map;
1472 brw_compute_vue_map(brw, &prev_stage_vue_map,
1473 c->key.input_slots_valid);
1474 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1475 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1476 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1477 slot++) {
1478 int varying = prev_stage_vue_map.slot_to_varying[slot];
1479 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1480 * unused.
1481 */
1482 if (varying != BRW_VARYING_SLOT_COUNT &&
1483 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1484 BITFIELD64_BIT(varying))) {
1485 c->prog_data.urb_setup[varying] = slot - first_slot;
1486 }
1487 }
1488 urb_next = prev_stage_vue_map.num_slots - first_slot;
1489 }
1490 } else {
1491 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1492 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1493 /* Point size is packed into the header, not as a general attribute */
1494 if (i == VARYING_SLOT_PSIZ)
1495 continue;
1496
1497 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1498 /* The back color slot is skipped when the front color is
1499 * also written to. In addition, some slots can be
1500 * written in the vertex shader and not read in the
1501 * fragment shader. So the register number must always be
1502 * incremented, mapped or not.
1503 */
1504 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1505 c->prog_data.urb_setup[i] = urb_next;
1506 urb_next++;
1507 }
1508 }
1509
1510 /*
1511 * It's an FS-only attribute, and we did the interpolation for this attribute
1512 * in the SF thread. So, count it here, too.
1513 *
1514 * See compile_sf_prog() for more info.
1515 */
1516 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1517 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1518 }
1519
1520 c->prog_data.num_varying_inputs = urb_next;
1521 }
1522
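/**
 * Rewrites LINTERP/CINTERP sources to the hardware registers where the
 * setup data actually lands, now that the push constant size is known.
 */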
1523 void
1524 fs_visitor::assign_urb_setup()
1525 {
1526 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1527
1528 /* Offset all the urb_setup[] indices by the actual position of the
1529 * setup regs, now that the location of the constants has been chosen.
1530 */
1531 foreach_list(node, &this->instructions) {
1532 fs_inst *inst = (fs_inst *)node;
1533
1534 if (inst->opcode == FS_OPCODE_LINTERP) {
1535 assert(inst->src[2].file == HW_REG);
1536 inst->src[2].fixed_hw_reg.nr += urb_start;
1537 }
1538
1539 if (inst->opcode == FS_OPCODE_CINTERP) {
1540 assert(inst->src[0].file == HW_REG);
1541 inst->src[0].fixed_hw_reg.nr += urb_start;
1542 }
1543 }
1544
1545 /* Each attribute is 4 setup channels, each of which is half a reg. */
1546 this->first_non_payload_grf =
1547 urb_start + c->prog_data.num_varying_inputs * 2;
1548 }
1549
1550 /**
1551 * Split large virtual GRFs into separate components if we can.
1552 *
1553 * This is mostly duplicated with what brw_fs_vector_splitting does,
1554 * but that's really conservative because it's afraid of doing
1555 * splitting that doesn't result in real progress after the rest of
1556 * the optimization phases, which would cause infinite looping in
1557 * optimization. We can do it once here, safely. This also has the
1558 * opportunity to split interpolated values, or maybe even uniforms,
1559 * which we don't have at the IR level.
1560 *
1561 * We want to split, because virtual GRFs are what we register
1562 * allocate and spill (due to contiguousness requirements for some
1563 * instructions), and they're what we naturally generate in the
1564 * codegen process, but most virtual GRFs don't actually need to be
1565 * contiguous sets of GRFs. If we split, we'll end up with reduced
1566 * live intervals and better dead code elimination and coalescing.
1567 */
1568 void
1569 fs_visitor::split_virtual_grfs()
1570 {
1571 int num_vars = this->virtual_grf_count;
1572 bool split_grf[num_vars];
1573 int new_virtual_grf[num_vars];
1574
1575 /* Try to split anything > 0 sized. */
1576 for (int i = 0; i < num_vars; i++) {
1577 if (this->virtual_grf_sizes[i] != 1)
1578 split_grf[i] = true;
1579 else
1580 split_grf[i] = false;
1581 }
1582
1583 if (brw->has_pln &&
1584 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1585 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1586 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1587 * Gen6, that was the only supported interpolation mode, and since Gen6,
1588 * delta_x and delta_y are in fixed hardware registers.
1589 */
1590 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1591 false;
1592 }
1593
1594 foreach_list(node, &this->instructions) {
1595 fs_inst *inst = (fs_inst *)node;
1596
1597 /* If there's a SEND message that requires contiguous destination
1598 * registers, no splitting is allowed.
1599 */
1600 if (inst->regs_written > 1) {
1601 split_grf[inst->dst.reg] = false;
1602 }
1603
1604 /* If we're sending from a GRF, don't split it, on the assumption that
1605 * the send is reading the whole thing.
1606 */
1607 if (inst->is_send_from_grf()) {
1608 for (int i = 0; i < 3; i++) {
1609 if (inst->src[i].file == GRF) {
1610 split_grf[inst->src[i].reg] = false;
1611 }
1612 }
1613 }
1614 }
1615
1616 /* Allocate new space for split regs. Note that the virtual
1617 * numbers will be contiguous.
1618 */
1619 for (int i = 0; i < num_vars; i++) {
1620 if (split_grf[i]) {
1621 new_virtual_grf[i] = virtual_grf_alloc(1);
1622 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1623 int reg = virtual_grf_alloc(1);
1624 assert(reg == new_virtual_grf[i] + j - 1);
1625 (void) reg;
1626 }
1627 this->virtual_grf_sizes[i] = 1;
1628 }
1629 }
1630
1631 foreach_list(node, &this->instructions) {
1632 fs_inst *inst = (fs_inst *)node;
1633
1634 if (inst->dst.file == GRF &&
1635 split_grf[inst->dst.reg] &&
1636 inst->dst.reg_offset != 0) {
1637 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1638 inst->dst.reg_offset - 1);
1639 inst->dst.reg_offset = 0;
1640 }
1641 for (int i = 0; i < 3; i++) {
1642 if (inst->src[i].file == GRF &&
1643 split_grf[inst->src[i].reg] &&
1644 inst->src[i].reg_offset != 0) {
1645 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1646 inst->src[i].reg_offset - 1);
1647 inst->src[i].reg_offset = 0;
1648 }
1649 }
1650 }
1651 invalidate_live_intervals();
1652 }
1653
1654 /**
1655 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1656 *
1657 * During code generation, we create tons of temporary variables, many of
1658 * which get immediately killed and are never used again. Yet, in later
1659 * optimization and analysis passes, such as compute_live_intervals, we need
1660 * to loop over all the virtual GRFs. Compacting them can save a lot of
1661 * overhead.
1662 */
1663 void
1664 fs_visitor::compact_virtual_grfs()
1665 {
1666 /* Mark which virtual GRFs are used, and count how many. */
1667 int remap_table[this->virtual_grf_count];
1668 memset(remap_table, -1, sizeof(remap_table));
1669
1670 foreach_list(node, &this->instructions) {
1671 const fs_inst *inst = (const fs_inst *) node;
1672
1673 if (inst->dst.file == GRF)
1674 remap_table[inst->dst.reg] = 0;
1675
1676 for (int i = 0; i < 3; i++) {
1677 if (inst->src[i].file == GRF)
1678 remap_table[inst->src[i].reg] = 0;
1679 }
1680 }
1681
1682 /* In addition to registers used in instructions, fs_visitor keeps
1683 * direct references to certain special values which must be patched:
1684 */
1685 fs_reg *special[] = {
1686 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1687 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1688 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1689 &delta_x[0], &delta_x[1], &delta_x[2],
1690 &delta_x[3], &delta_x[4], &delta_x[5],
1691 &delta_y[0], &delta_y[1], &delta_y[2],
1692 &delta_y[3], &delta_y[4], &delta_y[5],
1693 };
1694 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1695 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1696
1697 /* Treat all special values as used, to be conservative */
1698 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1699 if (special[i]->file == GRF)
1700 remap_table[special[i]->reg] = 0;
1701 }
1702
1703 /* Compact the GRF arrays. */
1704 int new_index = 0;
1705 for (int i = 0; i < this->virtual_grf_count; i++) {
1706 if (remap_table[i] != -1) {
1707 remap_table[i] = new_index;
1708 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1709 invalidate_live_intervals();
1710 ++new_index;
1711 }
1712 }
1713
1714 this->virtual_grf_count = new_index;
1715
1716 /* Patch all the instructions to use the newly renumbered registers */
1717 foreach_list(node, &this->instructions) {
1718 fs_inst *inst = (fs_inst *) node;
1719
1720 if (inst->dst.file == GRF)
1721 inst->dst.reg = remap_table[inst->dst.reg];
1722
1723 for (int i = 0; i < 3; i++) {
1724 if (inst->src[i].file == GRF)
1725 inst->src[i].reg = remap_table[inst->src[i].reg];
1726 }
1727 }
1728
1729 /* Patch all the references to special values */
1730 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1731 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1732 special[i]->reg = remap_table[special[i]->reg];
1733 }
1734 }
1735
1736 /*
1737 * Implements array access of uniforms by inserting a
1738 * PULL_CONSTANT_LOAD instruction.
1739 *
1740 * Unlike temporary GRF array access (where we don't support it due to
1741 * the difficulty of doing relative addressing on instruction
1742 * destinations), we could potentially do array access of uniforms
1743 * that were loaded in GRF space as push constants. In real-world
1744 * usage we've seen, though, the arrays being used are always larger
1745 * than we could load as push constants, so just always move all
1746 * uniform array access out to a pull constant buffer.
1747 */
1748 void
1749 fs_visitor::move_uniform_array_access_to_pull_constants()
1750 {
1751 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1752
1753 for (unsigned int i = 0; i < uniforms; i++) {
1754 pull_constant_loc[i] = -1;
1755 }
1756
1757 /* Walk through and find array access of uniforms. Put a copy of that
1758 * uniform in the pull constant buffer.
1759 *
1760 * Note that we don't move constant-indexed accesses to arrays. No
1761 * testing has been done of the performance impact of this choice.
1762 */
1763 foreach_list_safe(node, &this->instructions) {
1764 fs_inst *inst = (fs_inst *)node;
1765
1766 for (int i = 0 ; i < 3; i++) {
1767 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1768 continue;
1769
1770 int uniform = inst->src[i].reg;
1771
1772 /* If this array isn't already present in the pull constant buffer,
1773 * add it.
1774 */
1775 if (pull_constant_loc[uniform] == -1) {
1776 const float **values = &stage_prog_data->param[uniform];
1777
1778 assert(param_size[uniform]);
1779
1780 for (int j = 0; j < param_size[uniform]; j++) {
1781 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1782
1783 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1784 values[j];
1785 }
1786 }
1787 }
1788 }
1789 }
1790
1791 /**
1792 * Assign UNIFORM file registers to either push constants or pull constants.
1793 *
1794 * We allow a fragment shader to have more than the specified minimum
1795 * maximum number of fragment shader uniform components (64). If
1796 * there are too many of these, they'd fill up all of the register space.
1797 * So, this will push some of them out to the pull constant buffer and
1798 * update the program to load them.
1799 */
1800 void
1801 fs_visitor::assign_constant_locations()
1802 {
1803 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1804 if (dispatch_width != 8)
1805 return;
1806
1807 /* Find which UNIFORM registers are still in use. */
1808 bool is_live[uniforms];
1809 for (unsigned int i = 0; i < uniforms; i++) {
1810 is_live[i] = false;
1811 }
1812
1813 foreach_list(node, &this->instructions) {
1814 fs_inst *inst = (fs_inst *) node;
1815
1816 for (int i = 0; i < 3; i++) {
1817 if (inst->src[i].file != UNIFORM)
1818 continue;
1819
1820 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1821 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1822 is_live[constant_nr] = true;
1823 }
1824 }
1825
1826 /* Only allow 16 registers (128 uniform components) as push constants.
1827 *
1828 * Just demote the end of the list. We could probably do better
1829 * here, demoting things that are rarely used in the program first.
1830 */
1831 unsigned int max_push_components = 16 * 8;
1832 unsigned int num_push_constants = 0;
1833
1834 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1835
1836 for (unsigned int i = 0; i < uniforms; i++) {
1837 if (!is_live[i] || pull_constant_loc[i] != -1) {
1838 /* This UNIFORM register is either dead, or has already been demoted
1839 * to a pull const. Mark it as no longer living in the param[] array.
1840 */
1841 push_constant_loc[i] = -1;
1842 continue;
1843 }
1844
1845 if (num_push_constants < max_push_components) {
1846 /* Retain as a push constant. Record the location in the params[]
1847 * array.
1848 */
1849 push_constant_loc[i] = num_push_constants++;
1850 } else {
1851 /* Demote to a pull constant. */
1852 push_constant_loc[i] = -1;
1853
1854 int pull_index = stage_prog_data->nr_pull_params++;
1855 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1856 pull_constant_loc[i] = pull_index;
1857 }
1858 }
1859
1860 stage_prog_data->nr_params = num_push_constants;
1861
1862 /* Up until now, the param[] array has been indexed by reg + reg_offset
1863 * of UNIFORM registers. Condense it to only contain the uniforms we
1864 * chose to upload as push constants.
1865 */
1866 for (unsigned int i = 0; i < uniforms; i++) {
1867 int remapped = push_constant_loc[i];
1868
1869 if (remapped == -1)
1870 continue;
1871
1872 assert(remapped <= i);
1873 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1874 }
1875 }
1876
1877 /**
1878 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1879 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1880 */
1881 void
1882 fs_visitor::demote_pull_constants()
1883 {
1884 foreach_list(node, &this->instructions) {
1885 fs_inst *inst = (fs_inst *)node;
1886
1887 for (int i = 0; i < 3; i++) {
1888 if (inst->src[i].file != UNIFORM)
1889 continue;
1890
1891 int pull_index = pull_constant_loc[inst->src[i].reg +
1892 inst->src[i].reg_offset];
1893 if (pull_index == -1)
1894 continue;
1895
1896 /* Set up the annotation tracking for new generated instructions. */
1897 base_ir = inst->ir;
1898 current_annotation = inst->annotation;
1899
1900 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1901 fs_reg dst = fs_reg(this, glsl_type::float_type);
1902
1903 /* Generate a pull load into dst. */
1904 if (inst->src[i].reladdr) {
1905 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1906 surf_index,
1907 *inst->src[i].reladdr,
1908 pull_index);
1909 inst->insert_before(&list);
1910 inst->src[i].reladdr = NULL;
1911 } else {
1912 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1913 fs_inst *pull =
1914 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1915 dst, surf_index, offset);
1916 inst->insert_before(pull);
1917 inst->src[i].set_smear(pull_index & 3);
1918 }
1919
1920 /* Rewrite the instruction to use the temporary VGRF. */
1921 inst->src[i].file = GRF;
1922 inst->src[i].reg = dst.reg;
1923 inst->src[i].reg_offset = 0;
1924 }
1925 }
1926 invalidate_live_intervals();
1927 }
1928
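/**
 * Applies simple local algebraic simplifications, rewriting instructions in
 * place.  For example (illustrative):
 *
 *    mul vgrf2:F, vgrf1:F, 1.0f   ->  mov vgrf2:F, vgrf1:F
 *    add vgrf2:F, vgrf1:F, 0.0f   ->  mov vgrf2:F, vgrf1:F
 */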
1929 bool
1930 fs_visitor::opt_algebraic()
1931 {
1932 bool progress = false;
1933
1934 foreach_list(node, &this->instructions) {
1935 fs_inst *inst = (fs_inst *)node;
1936
1937 switch (inst->opcode) {
1938 case BRW_OPCODE_MUL:
1939 if (inst->src[1].file != IMM)
1940 continue;
1941
1942 /* a * 1.0 = a */
1943 if (inst->src[1].is_one()) {
1944 inst->opcode = BRW_OPCODE_MOV;
1945 inst->src[1] = reg_undef;
1946 progress = true;
1947 break;
1948 }
1949
1950 /* a * 0.0 = 0.0 */
1951 if (inst->src[1].is_zero()) {
1952 inst->opcode = BRW_OPCODE_MOV;
1953 inst->src[0] = inst->src[1];
1954 inst->src[1] = reg_undef;
1955 progress = true;
1956 break;
1957 }
1958
1959 break;
1960 case BRW_OPCODE_ADD:
1961 if (inst->src[1].file != IMM)
1962 continue;
1963
1964 /* a + 0.0 = a */
1965 if (inst->src[1].is_zero()) {
1966 inst->opcode = BRW_OPCODE_MOV;
1967 inst->src[1] = reg_undef;
1968 progress = true;
1969 break;
1970 }
1971 break;
1972 case BRW_OPCODE_OR:
1973 if (inst->src[0].equals(inst->src[1])) {
1974 inst->opcode = BRW_OPCODE_MOV;
1975 inst->src[1] = reg_undef;
1976 progress = true;
1977 break;
1978 }
1979 break;
1980 case BRW_OPCODE_LRP:
1981 if (inst->src[1].equals(inst->src[2])) {
1982 inst->opcode = BRW_OPCODE_MOV;
1983 inst->src[0] = inst->src[1];
1984 inst->src[1] = reg_undef;
1985 inst->src[2] = reg_undef;
1986 progress = true;
1987 break;
1988 }
1989 break;
1990 case BRW_OPCODE_SEL:
1991 if (inst->saturate && inst->src[1].file == IMM) {
1992 switch (inst->conditional_mod) {
1993 case BRW_CONDITIONAL_LE:
1994 case BRW_CONDITIONAL_L:
1995 switch (inst->src[1].type) {
1996 case BRW_REGISTER_TYPE_F:
1997 if (inst->src[1].imm.f >= 1.0f) {
1998 inst->opcode = BRW_OPCODE_MOV;
1999 inst->src[1] = reg_undef;
2000 progress = true;
2001 }
2002 break;
2003 default:
2004 break;
2005 }
2006 break;
2007 case BRW_CONDITIONAL_GE:
2008 case BRW_CONDITIONAL_G:
2009 switch (inst->src[1].type) {
2010 case BRW_REGISTER_TYPE_F:
2011 if (inst->src[1].imm.f <= 0.0f) {
2012 inst->opcode = BRW_OPCODE_MOV;
2013 inst->src[1] = reg_undef;
2014 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2015 progress = true;
2016 }
2017 break;
2018 default:
2019 break;
2020 }
2021 default:
2022 break;
2023 }
2024 }
2025 break;
2026 default:
2027 break;
2028 }
2029 }
2030
2031 return progress;
2032 }
2033
2034 /**
2035 * Removes any instructions writing a VGRF where that VGRF is not used by any
2036 * later instruction.
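 *
 * For example (illustrative): a MOV whose destination's live interval ends
 * at the MOV itself is never read afterwards and is removed; ADDC, SUBB and
 * MACH are instead retargeted to the null register to preserve their
 * accumulator write.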
2037 */
2038 bool
2039 fs_visitor::dead_code_eliminate()
2040 {
2041 bool progress = false;
2042 int pc = 0;
2043
2044 calculate_live_intervals();
2045
2046 foreach_list_safe(node, &this->instructions) {
2047 fs_inst *inst = (fs_inst *)node;
2048
2049 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2050 bool dead = true;
2051
2052 for (int i = 0; i < inst->regs_written; i++) {
2053 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2054 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2055 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2056 dead = false;
2057 break;
2058 }
2059 }
2060
2061 if (dead) {
2062 /* Don't dead code eliminate instructions that write to the
2063 * accumulator as a side-effect. Instead just set the destination
2064 * to the null register to free it.
2065 */
2066 switch (inst->opcode) {
2067 case BRW_OPCODE_ADDC:
2068 case BRW_OPCODE_SUBB:
2069 case BRW_OPCODE_MACH:
2070 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2071 break;
2072 default:
2073 inst->remove();
2074 progress = true;
2075 break;
2076 }
2077 }
2078 }
2079
2080 pc++;
2081 }
2082
2083 if (progress)
2084 invalidate_live_intervals();
2085
2086 return progress;
2087 }
2088
2089 struct dead_code_hash_key
2090 {
2091 int vgrf;
2092 int reg_offset;
2093 };
2094
2095 static bool
2096 dead_code_hash_compare(const void *a, const void *b)
2097 {
2098 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2099 }
2100
2101 static void
2102 clear_dead_code_hash(struct hash_table *ht)
2103 {
2104 struct hash_entry *entry;
2105
2106 hash_table_foreach(ht, entry) {
2107 _mesa_hash_table_remove(ht, entry);
2108 }
2109 }
2110
2111 static void
2112 insert_dead_code_hash(struct hash_table *ht,
2113 int vgrf, int reg_offset, fs_inst *inst)
2114 {
2115 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2116 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2117
2118 key->vgrf = vgrf;
2119 key->reg_offset = reg_offset;
2120
2121 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2122 }
2123
2124 static struct hash_entry *
2125 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2126 {
2127 struct dead_code_hash_key key;
2128
2129 key.vgrf = vgrf;
2130 key.reg_offset = reg_offset;
2131
2132 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2133 }
2134
2135 static void
2136 remove_dead_code_hash(struct hash_table *ht,
2137 int vgrf, int reg_offset)
2138 {
2139 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2140 if (!entry)
2141 return;
2142
2143 _mesa_hash_table_remove(ht, entry);
2144 }
2145
2146 /**
2147 * Walks basic blocks, removing any regs that are written but not read before
2148 * being redefined.
2149 *
2150 * The dead_code_eliminate() function implements a global dead code
2151 * elimination, but it only handles removing the last write to a register
2152 * if it's never read. This one can handle intermediate writes, but only
2153 * within a basic block.
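 *
 * For example (illustrative), within one basic block:
 *
 *    mov vgrf4:F, vgrf1:F    <- never read before the complete rewrite
 *    mov vgrf4:F, vgrf2:F       below, so it is removed
 *    mul vgrf5:F, vgrf5:F, vgrf4:F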
2154 */
2155 bool
2156 fs_visitor::dead_code_eliminate_local()
2157 {
2158 struct hash_table *ht;
2159 bool progress = false;
2160
2161 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2162
2163 if (ht == NULL) {
2164 return false;
2165 }
2166
2167 foreach_list_safe(node, &this->instructions) {
2168 fs_inst *inst = (fs_inst *)node;
2169
2170 /* At a basic block boundary, empty the HT since we don't understand
2171 * dataflow here.
2172 */
2173 if (inst->is_control_flow()) {
2174 clear_dead_code_hash(ht);
2175 continue;
2176 }
2177
2178 /* Clear the HT of any instructions that got read. */
2179 for (int i = 0; i < 3; i++) {
2180 fs_reg src = inst->src[i];
2181 if (src.file != GRF)
2182 continue;
2183
2184 int read = 1;
2185 if (inst->is_send_from_grf())
2186 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2187
2188 for (int reg_offset = src.reg_offset;
2189 reg_offset < src.reg_offset + read;
2190 reg_offset++) {
2191 remove_dead_code_hash(ht, src.reg, reg_offset);
2192 }
2193 }
2194
2195 /* Add any update of a GRF to the HT, removing a previous write if it
2196 * wasn't read.
2197 */
2198 if (inst->dst.file == GRF) {
2199 if (inst->regs_written > 1) {
2200 /* We don't know how to trim channels from an instruction's
2201 * writes, so we can't incrementally remove unread channels from
2202 * it. Just remove whatever it overwrites from the table.
2203 */
2204 for (int i = 0; i < inst->regs_written; i++) {
2205 remove_dead_code_hash(ht,
2206 inst->dst.reg,
2207 inst->dst.reg_offset + i);
2208 }
2209 } else {
2210 struct hash_entry *entry =
2211 get_dead_code_hash_entry(ht, inst->dst.reg,
2212 inst->dst.reg_offset);
2213
2214 if (entry) {
2215 if (inst->is_partial_write()) {
2216 /* For a partial write, we can't remove any previous dead code
2217 * candidate, since we're just modifying their result.
2218 */
2219 } else {
2220 /* We're completely updating a channel, and there was a
2221 * previous write to the channel that wasn't read. Kill it!
2222 */
2223 fs_inst *inst = (fs_inst *)entry->data;
2224 inst->remove();
2225 progress = true;
2226 }
2227
2228 _mesa_hash_table_remove(ht, entry);
2229 }
2230
2231 if (!inst->has_side_effects())
2232 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2233 inst);
2234 }
2235 }
2236 }
2237
2238 _mesa_hash_table_destroy(ht, NULL);
2239
2240 if (progress)
2241 invalidate_live_intervals();
2242
2243 return progress;
2244 }
2245
2246 /**
2247 * Implements register coalescing: Checks if the two registers involved in a
2248 * raw move don't interfere, in which case they can both be stored in the same
2249 * place and the MOV removed.
2250 *
2251 * To do this, all uses of the source of the MOV in the shader are replaced
2252 * with the destination of the MOV. For example:
2253 *
2254 * add vgrf3:F, vgrf1:F, vgrf2:F
2255 * mov vgrf4:F, vgrf3:F
2256 * mul vgrf5:F, vgrf5:F, vgrf4:F
2257 *
2258 * becomes
2259 *
2260 * add vgrf4:F, vgrf1:F, vgrf2:F
2261 * mul vgrf5:F, vgrf5:F, vgrf4:F
2262 */
2263 bool
2264 fs_visitor::register_coalesce()
2265 {
2266 bool progress = false;
2267
2268 calculate_live_intervals();
2269
2270 int src_size = 0;
2271 int channels_remaining = 0;
2272 int reg_from = -1, reg_to = -1;
2273 int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2274 fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2275
2276 foreach_list(node, &this->instructions) {
2277 fs_inst *inst = (fs_inst *)node;
2278
2279 if (inst->opcode != BRW_OPCODE_MOV ||
2280 inst->is_partial_write() ||
2281 inst->saturate ||
2282 inst->src[0].file != GRF ||
2283 inst->src[0].negate ||
2284 inst->src[0].abs ||
2285 !inst->src[0].is_contiguous() ||
2286 inst->dst.file != GRF ||
2287 inst->dst.type != inst->src[0].type) {
2288 continue;
2289 }
2290
2291 if (virtual_grf_sizes[inst->src[0].reg] >
2292 virtual_grf_sizes[inst->dst.reg])
2293 continue;
2294
2295 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2296 int var_to = live_intervals->var_from_reg(&inst->dst);
2297
2298 if (live_intervals->vars_interfere(var_from, var_to) &&
2299 !inst->dst.equals(inst->src[0])) {
2300
2301 /* We know that the live ranges of A (var_from) and B (var_to)
2302 * interfere because of the ->vars_interfere() call above. If the end
2303 * of B's live range is after the end of A's range, then we know two
2304 * things:
2305 * - the start of B's live range must be in A's live range (since we
2306 * already know the two ranges interfere, this is the only remaining
2307 * possibility)
2308 * - the interference isn't of the form we're looking for (where B is
2309 * entirely inside A)
2310 */
2311 if (live_intervals->end[var_to] > live_intervals->end[var_from])
2312 continue;
2313
2314 bool overwritten = false;
2315 int scan_ip = -1;
2316
2317 foreach_list(n, &this->instructions) {
2318 fs_inst *scan_inst = (fs_inst *)n;
2319 scan_ip++;
2320
2321 if (scan_inst->is_control_flow()) {
2322 overwritten = true;
2323 break;
2324 }
2325
2326 if (scan_ip <= live_intervals->start[var_to])
2327 continue;
2328
2329 if (scan_ip > live_intervals->end[var_to])
2330 break;
2331
2332 if (scan_inst->dst.equals(inst->dst) ||
2333 scan_inst->dst.equals(inst->src[0])) {
2334 overwritten = true;
2335 break;
2336 }
2337 }
2338
2339 if (overwritten)
2340 continue;
2341 }
2342
2343 if (reg_from != inst->src[0].reg) {
2344 reg_from = inst->src[0].reg;
2345
2346 src_size = virtual_grf_sizes[inst->src[0].reg];
2347 assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2348
2349 channels_remaining = src_size;
2350 memset(mov, 0, sizeof(mov));
2351
2352 reg_to = inst->dst.reg;
2353 }
2354
2355 if (reg_to != inst->dst.reg)
2356 continue;
2357
2358 const int offset = inst->src[0].reg_offset;
2359 reg_to_offset[offset] = inst->dst.reg_offset;
2360 mov[offset] = inst;
2361 channels_remaining--;
2362
2363 if (channels_remaining)
2364 continue;
2365
2366 bool removed = false;
2367 for (int i = 0; i < src_size; i++) {
2368 if (mov[i]) {
2369 removed = true;
2370
2371 mov[i]->opcode = BRW_OPCODE_NOP;
2372 mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2373 mov[i]->dst = reg_undef;
2374 mov[i]->src[0] = reg_undef;
2375 mov[i]->src[1] = reg_undef;
2376 mov[i]->src[2] = reg_undef;
2377 }
2378 }
2379
2380 foreach_list(node, &this->instructions) {
2381 fs_inst *scan_inst = (fs_inst *)node;
2382
2383 for (int i = 0; i < src_size; i++) {
2384 if (mov[i]) {
2385 if (scan_inst->dst.file == GRF &&
2386 scan_inst->dst.reg == reg_from &&
2387 scan_inst->dst.reg_offset == i) {
2388 scan_inst->dst.reg = reg_to;
2389 scan_inst->dst.reg_offset = reg_to_offset[i];
2390 }
2391 for (int j = 0; j < 3; j++) {
2392 if (scan_inst->src[j].file == GRF &&
2393 scan_inst->src[j].reg == reg_from &&
2394 scan_inst->src[j].reg_offset == i) {
2395 scan_inst->src[j].reg = reg_to;
2396 scan_inst->src[j].reg_offset = reg_to_offset[i];
2397 }
2398 }
2399 }
2400 }
2401 }
2402
2403 if (removed) {
2404 live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2405 live_intervals->start[var_from]);
2406 live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2407 live_intervals->end[var_from]);
2408 reg_from = -1;
2409 }
2410 }
2411
2412 foreach_list_safe(node, &this->instructions) {
2413 fs_inst *inst = (fs_inst *)node;
2414
2415 if (inst->opcode == BRW_OPCODE_NOP) {
2416 inst->remove();
2417 progress = true;
2418 }
2419 }
2420
2421 if (progress)
2422 invalidate_live_intervals();
2423
2424 return progress;
2425 }
2426
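/**
 * Rewrites the instruction that computed a GRF value so it writes directly
 * to the MRF that a following raw MOV copied it to, and removes the MOV.
 * For example (register numbers illustrative):
 *
 *    add vgrf3:F, vgrf1:F, vgrf2:F
 *    mov m4:F, vgrf3:F
 *
 * becomes
 *
 *    add m4:F, vgrf1:F, vgrf2:F
 */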
2427 bool
2428 fs_visitor::compute_to_mrf()
2429 {
2430 bool progress = false;
2431 int next_ip = 0;
2432
2433 calculate_live_intervals();
2434
2435 foreach_list_safe(node, &this->instructions) {
2436 fs_inst *inst = (fs_inst *)node;
2437
2438 int ip = next_ip;
2439 next_ip++;
2440
2441 if (inst->opcode != BRW_OPCODE_MOV ||
2442 inst->is_partial_write() ||
2443 inst->dst.file != MRF || inst->src[0].file != GRF ||
2444 inst->dst.type != inst->src[0].type ||
2445 inst->src[0].abs || inst->src[0].negate ||
2446 !inst->src[0].is_contiguous() ||
2447 inst->src[0].subreg_offset)
2448 continue;
2449
2450 /* Work out which hardware MRF registers are written by this
2451 * instruction.
2452 */
2453 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2454 int mrf_high;
2455 if (inst->dst.reg & BRW_MRF_COMPR4) {
2456 mrf_high = mrf_low + 4;
2457 } else if (dispatch_width == 16 &&
2458 (!inst->force_uncompressed && !inst->force_sechalf)) {
2459 mrf_high = mrf_low + 1;
2460 } else {
2461 mrf_high = mrf_low;
2462 }
2463
2464 /* Can't compute-to-MRF this GRF if someone else was going to
2465 * read it later.
2466 */
2467 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2468 continue;
2469
2470 /* Found a move of a GRF to a MRF. Let's see if we can go
2471 * rewrite the thing that made this GRF to write into the MRF.
2472 */
2473 fs_inst *scan_inst;
2474 for (scan_inst = (fs_inst *)inst->prev;
2475 scan_inst->prev != NULL;
2476 scan_inst = (fs_inst *)scan_inst->prev) {
2477 if (scan_inst->dst.file == GRF &&
2478 scan_inst->dst.reg == inst->src[0].reg) {
2479 /* Found the last thing to write our reg we want to turn
2480 * into a compute-to-MRF.
2481 */
2482
2483 /* If this one instruction didn't populate all the
2484 * channels, bail. We might be able to rewrite everything
2485 * that writes that reg, but it would require smarter
2486 * tracking to delay the rewriting until complete success.
2487 */
2488 if (scan_inst->is_partial_write())
2489 break;
2490
2491 /* Things returning more than one register would need us to
2492 * understand coalescing out more than one MOV at a time.
2493 */
2494 if (scan_inst->regs_written > 1)
2495 break;
2496
2497 /* SEND instructions can't have MRF as a destination. */
2498 if (scan_inst->mlen)
2499 break;
2500
2501 if (brw->gen == 6) {
2502 /* gen6 math instructions must have the destination be
2503 * GRF, so no compute-to-MRF for them.
2504 */
2505 if (scan_inst->is_math()) {
2506 break;
2507 }
2508 }
2509
2510 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2511 /* Found the creator of our MRF's source value. */
2512 scan_inst->dst.file = MRF;
2513 scan_inst->dst.reg = inst->dst.reg;
2514 scan_inst->saturate |= inst->saturate;
2515 inst->remove();
2516 progress = true;
2517 }
2518 break;
2519 }
2520
2521 /* We don't handle control flow here. Most computation of
2522 * values that end up in MRFs is shortly before the MRF
2523 * write anyway.
2524 */
2525 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2526 break;
2527
2528 /* You can't read from an MRF, so if someone else reads our
2529 * MRF's source GRF that we wanted to rewrite, that stops us.
2530 */
2531 bool interfered = false;
2532 for (int i = 0; i < 3; i++) {
2533 if (scan_inst->src[i].file == GRF &&
2534 scan_inst->src[i].reg == inst->src[0].reg &&
2535 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2536 interfered = true;
2537 }
2538 }
2539 if (interfered)
2540 break;
2541
2542 if (scan_inst->dst.file == MRF) {
2543 /* If somebody else writes our MRF here, we can't
2544 * compute-to-MRF before that.
2545 */
2546 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2547 int scan_mrf_high;
2548
2549 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2550 scan_mrf_high = scan_mrf_low + 4;
2551 } else if (dispatch_width == 16 &&
2552 (!scan_inst->force_uncompressed &&
2553 !scan_inst->force_sechalf)) {
2554 scan_mrf_high = scan_mrf_low + 1;
2555 } else {
2556 scan_mrf_high = scan_mrf_low;
2557 }
2558
2559 if (mrf_low == scan_mrf_low ||
2560 mrf_low == scan_mrf_high ||
2561 mrf_high == scan_mrf_low ||
2562 mrf_high == scan_mrf_high) {
2563 break;
2564 }
2565 }
2566
2567 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2568 /* Found a SEND instruction, which means that there are
2569 * live values in MRFs from base_mrf to base_mrf +
2570 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2571 * above it.
2572 */
2573 if (mrf_low >= scan_inst->base_mrf &&
2574 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2575 break;
2576 }
2577 if (mrf_high >= scan_inst->base_mrf &&
2578 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2579 break;
2580 }
2581 }
2582 }
2583 }
2584
2585 if (progress)
2586 invalidate_live_intervals();
2587
2588 return progress;
2589 }
2590
2591 /**
2592 * Walks through basic blocks, looking for repeated MRF writes and
2593 * removing the later ones.
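 *
 * For example (illustrative):
 *
 *    mov m2:F, vgrf5:F
 *    ...                  <- no control flow, no write to m2 or vgrf5
 *    mov m2:F, vgrf5:F    <- identical to the tracked write above, removed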
2594 */
2595 bool
2596 fs_visitor::remove_duplicate_mrf_writes()
2597 {
2598 fs_inst *last_mrf_move[16];
2599 bool progress = false;
2600
2601 /* We'd need to update the MRF tracking to handle compressed instructions. */
2602 if (dispatch_width == 16)
2603 return false;
2604
2605 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2606
2607 foreach_list_safe(node, &this->instructions) {
2608 fs_inst *inst = (fs_inst *)node;
2609
2610 if (inst->is_control_flow()) {
2611 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2612 }
2613
2614 if (inst->opcode == BRW_OPCODE_MOV &&
2615 inst->dst.file == MRF) {
2616 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2617 if (prev_inst && inst->equals(prev_inst)) {
2618 inst->remove();
2619 progress = true;
2620 continue;
2621 }
2622 }
2623
2624 /* Clear out the last-write records for MRFs that were overwritten. */
2625 if (inst->dst.file == MRF) {
2626 last_mrf_move[inst->dst.reg] = NULL;
2627 }
2628
2629 if (inst->mlen > 0 && inst->base_mrf != -1) {
2630 /* Found a SEND instruction, which will include two or fewer
2631 * implied MRF writes. We could do better here.
2632 */
2633 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2634 last_mrf_move[inst->base_mrf + i] = NULL;
2635 }
2636 }
2637
2638 /* Clear out any MRF move records whose sources got overwritten. */
2639 if (inst->dst.file == GRF) {
2640 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2641 if (last_mrf_move[i] &&
2642 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2643 last_mrf_move[i] = NULL;
2644 }
2645 }
2646 }
2647
2648 if (inst->opcode == BRW_OPCODE_MOV &&
2649 inst->dst.file == MRF &&
2650 inst->src[0].file == GRF &&
2651 !inst->is_partial_write()) {
2652 last_mrf_move[inst->dst.reg] = inst;
2653 }
2654 }
2655
2656 if (progress)
2657 invalidate_live_intervals();
2658
2659 return progress;
2660 }
2661
2662 static void
2663 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2664 int first_grf, int grf_len)
2665 {
2666 bool inst_simd16 = (dispatch_width > 8 &&
2667 !inst->force_uncompressed &&
2668 !inst->force_sechalf);
2669
2670 /* Clear the flag for registers that actually got read (as expected). */
2671 for (int i = 0; i < 3; i++) {
2672 int grf;
2673 if (inst->src[i].file == GRF) {
2674 grf = inst->src[i].reg;
2675 } else if (inst->src[i].file == HW_REG &&
2676 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2677 grf = inst->src[i].fixed_hw_reg.nr;
2678 } else {
2679 continue;
2680 }
2681
2682 if (grf >= first_grf &&
2683 grf < first_grf + grf_len) {
2684 deps[grf - first_grf] = false;
2685 if (inst_simd16)
2686 deps[grf - first_grf + 1] = false;
2687 }
2688 }
2689 }
2690
2691 /**
2692 * Implements this workaround for the original 965:
2693 *
2694 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2695 * check for post destination dependencies on this instruction, software
2696 * must ensure that there is no destination hazard for the case of ‘write
2697 * followed by a posted write’ shown in the following example.
2698 *
2699 * 1. mov r3 0
2700 * 2. send r3.xy <rest of send instruction>
2701 * 3. mov r2 r3
2702 *
2703 * Due to no post-destination dependency check on the ‘send’, the above
2704 * code sequence could have two instructions (1 and 2) in flight at the
2705 * same time that both consider ‘r3’ as the target of their final writes.
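 *
 * The pass below handles this by scanning backwards from the SEND-like
 * instruction and inserting dependency-resolving MOVs (DEP_RESOLVE_MOV)
 * immediately before it for any of its destination registers that still
 * have an outstanding, unread write.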
2706 */
2707 void
2708 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2709 {
2710 int reg_size = dispatch_width / 8;
2711 int write_len = inst->regs_written * reg_size;
2712 int first_write_grf = inst->dst.reg;
2713 bool needs_dep[BRW_MAX_MRF];
2714 assert(write_len < (int)sizeof(needs_dep) - 1);
2715
2716 memset(needs_dep, false, sizeof(needs_dep));
2717 memset(needs_dep, true, write_len);
2718
2719 clear_deps_for_inst_src(inst, dispatch_width,
2720 needs_dep, first_write_grf, write_len);
2721
2722 /* Walk backwards looking for writes to registers we're writing which
2723 * aren't read since being written. If we hit the start of the program,
2724 * we assume that there are no outstanding dependencies on entry to the
2725 * program.
2726 */
2727 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2728 scan_inst != NULL;
2729 scan_inst = (fs_inst *)scan_inst->prev) {
2730
2731 /* If we hit control flow, assume that there *are* outstanding
2732 * dependencies, and force their cleanup before our instruction.
2733 */
2734 if (scan_inst->is_control_flow()) {
2735 for (int i = 0; i < write_len; i++) {
2736 if (needs_dep[i]) {
2737 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2738 }
2739 }
2740 return;
2741 }
2742
2743 bool scan_inst_simd16 = (dispatch_width > 8 &&
2744 !scan_inst->force_uncompressed &&
2745 !scan_inst->force_sechalf);
2746
2747 /* We insert our reads as late as possible on the assumption that any
2748 * instruction but a MOV that might have left us an outstanding
2749 * dependency has more latency than a MOV.
2750 */
2751 if (scan_inst->dst.file == GRF) {
2752 for (int i = 0; i < scan_inst->regs_written; i++) {
2753 int reg = scan_inst->dst.reg + i * reg_size;
2754
2755 if (reg >= first_write_grf &&
2756 reg < first_write_grf + write_len &&
2757 needs_dep[reg - first_write_grf]) {
2758 inst->insert_before(DEP_RESOLVE_MOV(reg));
2759 needs_dep[reg - first_write_grf] = false;
2760 if (scan_inst_simd16)
2761 needs_dep[reg - first_write_grf + 1] = false;
2762 }
2763 }
2764 }
2765
2766 /* Clear the flag for registers that actually got read (as expected). */
2767 clear_deps_for_inst_src(scan_inst, dispatch_width,
2768 needs_dep, first_write_grf, write_len);
2769
2770 /* Continue the loop only if we haven't resolved all the dependencies */
2771 int i;
2772 for (i = 0; i < write_len; i++) {
2773 if (needs_dep[i])
2774 break;
2775 }
2776 if (i == write_len)
2777 return;
2778 }
2779 }
2780
2781 /**
2782 * Implements this workaround for the original 965:
2783 *
2784 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2785 * used as a destination register until after it has been sourced by an
2786 * instruction with a different destination register."
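 *
 * The pass below handles this by scanning forwards from the SEND-like
 * instruction and inserting a dependency-resolving MOV (DEP_RESOLVE_MOV)
 * before any instruction that would overwrite part of the SEND's destination
 * without that register having been read first.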
2787 */
2788 void
2789 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2790 {
2791 int write_len = inst->regs_written * dispatch_width / 8;
2792 int first_write_grf = inst->dst.reg;
2793 bool needs_dep[BRW_MAX_MRF];
2794 assert(write_len < (int)sizeof(needs_dep) - 1);
2795
2796 memset(needs_dep, false, sizeof(needs_dep));
2797 memset(needs_dep, true, write_len);
2798 /* Walk forwards looking for writes to registers we're writing which aren't
2799 * read before being written.
2800 */
2801 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2802 !scan_inst->is_tail_sentinel();
2803 scan_inst = (fs_inst *)scan_inst->next) {
2804 /* If we hit control flow, force resolve all remaining dependencies. */
2805 if (scan_inst->is_control_flow()) {
2806 for (int i = 0; i < write_len; i++) {
2807 if (needs_dep[i])
2808 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2809 }
2810 return;
2811 }
2812
2813 /* Clear the flag for registers that actually got read (as expected). */
2814 clear_deps_for_inst_src(scan_inst, dispatch_width,
2815 needs_dep, first_write_grf, write_len);
2816
2817 /* We insert our reads as late as possible since they're reading the
2818 * result of a SEND, which has massive latency.
2819 */
2820 if (scan_inst->dst.file == GRF &&
2821 scan_inst->dst.reg >= first_write_grf &&
2822 scan_inst->dst.reg < first_write_grf + write_len &&
2823 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2824 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2825 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2826 }
2827
2828 /* Continue the loop only if we haven't resolved all the dependencies */
2829 int i;
2830 for (i = 0; i < write_len; i++) {
2831 if (needs_dep[i])
2832 break;
2833 }
2834 if (i == write_len)
2835 return;
2836 }
2837
2838 /* If we hit the end of the program, resolve all remaining dependencies out
2839 * of paranoia.
2840 */
2841 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2842 assert(last_inst->eot);
2843 for (int i = 0; i < write_len; i++) {
2844 if (needs_dep[i])
2845 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2846 }
2847 }
2848
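/**
 * Applies both gen4 SEND dependency workarounds above, on original gen4
 * (non-G4x) hardware, to every instruction with a message length (mlen != 0)
 * and a GRF destination.
 */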
2849 void
2850 fs_visitor::insert_gen4_send_dependency_workarounds()
2851 {
2852 if (brw->gen != 4 || brw->is_g4x)
2853 return;
2854
2855 /* Note that we're done with register allocation, so GRF fs_regs always
2856 * have a .reg_offset of 0.
2857 */
2858
2859 foreach_list_safe(node, &this->instructions) {
2860 fs_inst *inst = (fs_inst *)node;
2861
2862 if (inst->mlen != 0 && inst->dst.file == GRF) {
2863 insert_gen4_pre_send_dependency_workarounds(inst);
2864 insert_gen4_post_send_dependency_workarounds(inst);
2865 }
2866 }
2867 }
2868
2869 /**
2870 * Turns the generic expression-style uniform pull constant load instruction
2871 * into a hardware-specific series of instructions for loading a pull
2872 * constant.
2873 *
2874 * The expression style allows the CSE pass before this to optimize out
2875 * repeated loads from the same offset, and gives the pre-register-allocation
2876 * scheduling full flexibility, while the conversion to native instructions
2877 * allows the post-register-allocation scheduler the best information
2878 * possible.
2879 *
2880 * Note that execution masking for setting up pull constant loads is special:
2881 * the channels that need to be written are unrelated to the current execution
2882 * mask, since a later instruction will use one of the result channels as a
2883 * source operand for all 8 or 16 of its channels.
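 *
 * On gen7, for example (offsets illustrative), a load with a vec4-aligned
 * byte offset of 48 becomes a SET_SIMD4X2_OFFSET writing the dword offset 12
 * into a payload register, followed by UNIFORM_PULL_CONSTANT_LOAD_GEN7
 * sourcing that payload.  On earlier gens the instruction just gets
 * base_mrf/mlen assigned so the generator can emit the message itself.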
2884 */
2885 void
2886 fs_visitor::lower_uniform_pull_constant_loads()
2887 {
2888 foreach_list(node, &this->instructions) {
2889 fs_inst *inst = (fs_inst *)node;
2890
2891 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2892 continue;
2893
2894 if (brw->gen >= 7) {
2895 /* The offset arg before was a vec4-aligned byte offset. We need to
2896 * turn it into a dword offset.
2897 */
2898 fs_reg const_offset_reg = inst->src[1];
2899 assert(const_offset_reg.file == IMM &&
2900 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2901 const_offset_reg.imm.u /= 4;
2902 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2903
2904 /* This is actually going to be a MOV, but since only the first dword
2905 * is accessed, we have a special opcode to do just that one. Note
2906 * that this needs to be an operation that will be considered a def
2907 * by live variable analysis, or register allocation will explode.
2908 */
2909 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2910 payload, const_offset_reg);
2911 setup->force_writemask_all = true;
2912
2913 setup->ir = inst->ir;
2914 setup->annotation = inst->annotation;
2915 inst->insert_before(setup);
2916
2917 /* Similarly, this will only populate the first 4 channels of the
2918 * result register (since we only use smear values from 0-3), but we
2919 * don't tell the optimizer.
2920 */
2921 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2922 inst->src[1] = payload;
2923
2924 invalidate_live_intervals();
2925 } else {
2926 /* Before register allocation, we didn't tell the scheduler about the
2927 * MRF we use. We know it's safe to use this MRF because nothing
2928 * else uses it except register spill/unspill, which generates and
2929 * uses its MRF within a single IR instruction.
2930 */
2931 inst->base_mrf = 14;
2932 inst->mlen = 1;
2933 }
2934 }
2935 }
2936
2937 void
2938 fs_visitor::dump_instructions()
2939 {
2940 calculate_register_pressure();
2941
2942 int ip = 0, max_pressure = 0;
2943 foreach_list(node, &this->instructions) {
2944 backend_instruction *inst = (backend_instruction *)node;
2945 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2946 fprintf(stderr, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2947 dump_instruction(inst);
2948 ++ip;
2949 }
2950 fprintf(stderr, "Maximum %3d registers live at once.\n", max_pressure);
2951 }
2952
2953 void
2954 fs_visitor::dump_instruction(backend_instruction *be_inst)
2955 {
2956 fs_inst *inst = (fs_inst *)be_inst;
2957
2958 if (inst->predicate) {
2959 fprintf(stderr, "(%cf0.%d) ",
2960 inst->predicate_inverse ? '-' : '+',
2961 inst->flag_subreg);
2962 }
2963
2964 fprintf(stderr, "%s", brw_instruction_name(inst->opcode));
2965 if (inst->saturate)
2966 fprintf(stderr, ".sat");
2967 if (inst->conditional_mod) {
2968 fprintf(stderr, "%s", conditional_modifier[inst->conditional_mod]);
2969 if (!inst->predicate &&
2970 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2971 inst->opcode != BRW_OPCODE_IF &&
2972 inst->opcode != BRW_OPCODE_WHILE))) {
2973 fprintf(stderr, ".f0.%d", inst->flag_subreg);
2974 }
2975 }
2976 fprintf(stderr, " ");
2977
2978
2979 switch (inst->dst.file) {
2980 case GRF:
2981 fprintf(stderr, "vgrf%d", inst->dst.reg);
2982 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2983 inst->dst.subreg_offset)
2984 fprintf(stderr, "+%d.%d",
2985 inst->dst.reg_offset, inst->dst.subreg_offset);
2986 break;
2987 case MRF:
2988 fprintf(stderr, "m%d", inst->dst.reg);
2989 break;
2990 case BAD_FILE:
2991 fprintf(stderr, "(null)");
2992 break;
2993 case UNIFORM:
2994 fprintf(stderr, "***u%d***", inst->dst.reg);
2995 break;
2996 case HW_REG:
2997 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2998 switch (inst->dst.fixed_hw_reg.nr) {
2999 case BRW_ARF_NULL:
3000 fprintf(stderr, "null");
3001 break;
3002 case BRW_ARF_ADDRESS:
3003 fprintf(stderr, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3004 break;
3005 case BRW_ARF_ACCUMULATOR:
3006 fprintf(stderr, "acc%d", inst->dst.fixed_hw_reg.subnr);
3007 break;
3008 case BRW_ARF_FLAG:
3009 fprintf(stderr, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3010 inst->dst.fixed_hw_reg.subnr);
3011 break;
3012 default:
3013 fprintf(stderr, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3014 inst->dst.fixed_hw_reg.subnr);
3015 break;
3016 }
3017 } else {
3018 fprintf(stderr, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3019 }
3020 if (inst->dst.fixed_hw_reg.subnr)
3021 fprintf(stderr, "+%d", inst->dst.fixed_hw_reg.subnr);
3022 break;
3023 default:
3024 fprintf(stderr, "???");
3025 break;
3026 }
3027 fprintf(stderr, ":%s, ", brw_reg_type_letters(inst->dst.type));
3028
3029 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3030 if (inst->src[i].negate)
3031 fprintf(stderr, "-");
3032 if (inst->src[i].abs)
3033 fprintf(stderr, "|");
3034 switch (inst->src[i].file) {
3035 case GRF:
3036 fprintf(stderr, "vgrf%d", inst->src[i].reg);
3037 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3038 inst->src[i].subreg_offset)
3039 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
3040 inst->src[i].subreg_offset);
3041 break;
3042 case MRF:
3043 fprintf(stderr, "***m%d***", inst->src[i].reg);
3044 break;
3045 case UNIFORM:
3046 fprintf(stderr, "u%d", inst->src[i].reg);
3047 if (inst->src[i].reladdr) {
3048 fprintf(stderr, "+reladdr");
3049 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3050 inst->src[i].subreg_offset) {
3051 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
3052 inst->src[i].subreg_offset);
3053 }
3054 break;
3055 case BAD_FILE:
3056 fprintf(stderr, "(null)");
3057 break;
3058 case IMM:
3059 switch (inst->src[i].type) {
3060 case BRW_REGISTER_TYPE_F:
3061 fprintf(stderr, "%ff", inst->src[i].imm.f);
3062 break;
3063 case BRW_REGISTER_TYPE_D:
3064 fprintf(stderr, "%dd", inst->src[i].imm.i);
3065 break;
3066 case BRW_REGISTER_TYPE_UD:
3067 fprintf(stderr, "%uu", inst->src[i].imm.u);
3068 break;
3069 default:
3070 fprintf(stderr, "???");
3071 break;
3072 }
3073 break;
3074 case HW_REG:
3075 if (inst->src[i].fixed_hw_reg.negate)
3076 fprintf(stderr, "-");
3077 if (inst->src[i].fixed_hw_reg.abs)
3078 fprintf(stderr, "|");
3079 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3080 switch (inst->src[i].fixed_hw_reg.nr) {
3081 case BRW_ARF_NULL:
3082 fprintf(stderr, "null");
3083 break;
3084 case BRW_ARF_ADDRESS:
3085 fprintf(stderr, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3086 break;
3087 case BRW_ARF_ACCUMULATOR:
3088 fprintf(stderr, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3089 break;
3090 case BRW_ARF_FLAG:
3091 fprintf(stderr, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3092 inst->src[i].fixed_hw_reg.subnr);
3093 break;
3094 default:
3095 fprintf(stderr, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3096 inst->src[i].fixed_hw_reg.subnr);
3097 break;
3098 }
3099 } else {
3100 fprintf(stderr, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3101 }
3102 if (inst->src[i].fixed_hw_reg.subnr)
3103 fprintf(stderr, "+%d", inst->src[i].fixed_hw_reg.subnr);
3104 if (inst->src[i].fixed_hw_reg.abs)
3105 fprintf(stderr, "|");
3106 break;
3107 default:
3108 fprintf(stderr, "???");
3109 break;
3110 }
3111 if (inst->src[i].abs)
3112 fprintf(stderr, "|");
3113
3114 if (inst->src[i].file != IMM) {
3115 fprintf(stderr, ":%s", brw_reg_type_letters(inst->src[i].type));
3116 }
3117
3118 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3119 fprintf(stderr, ", ");
3120 }
3121
3122 fprintf(stderr, " ");
3123
3124 if (inst->force_uncompressed)
3125 fprintf(stderr, "1sthalf ");
3126
3127 if (inst->force_sechalf)
3128 fprintf(stderr, "2ndhalf ");
3129
3130 fprintf(stderr, "\n");
3131 }
3132
3133 /**
3134 * Possibly returns an instruction that set up @param reg.
3135 *
3136 * Sometimes we want to take the result of some expression/variable
3137 * dereference tree and rewrite the instruction generating the result
3138 * of the tree. When processing the tree, we know that the
3139 * instructions generated are all writing temporaries that are dead
3140 * outside of this tree. So, if we have some instructions that write
3141 * a temporary, we're free to point that temp write somewhere else.
3142 *
3143 * Note that this doesn't guarantee that the returned instruction wrote
3144 * only reg -- it might be the size=4 destination of a texture instruction.
3145 */
3146 fs_inst *
3147 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3148 fs_inst *end,
3149 const fs_reg &reg)
3150 {
3151 if (end == start ||
3152 end->is_partial_write() ||
3153 reg.reladdr ||
3154 !reg.equals(end->dst)) {
3155 return NULL;
3156 } else {
3157 return end;
3158 }
3159 }
3160
3161 void
3162 fs_visitor::setup_payload_gen6()
3163 {
3164 bool uses_depth =
3165 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3166 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3167
3168 assert(brw->gen >= 6);
3169
3170 /* R0-1: masks, pixel X/Y coordinates. */
3171 c->nr_payload_regs = 2;
3172 /* R2: only for 32-pixel dispatch. */
3173
3174 /* R3-26: barycentric interpolation coordinates. These appear in the
3175 * same order that they appear in the brw_wm_barycentric_interp_mode
3176 * enum. Each set of coordinates occupies 2 registers if dispatch width
3177 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3178 * appear if they were enabled using the "Barycentric Interpolation
3179 * Mode" bits in WM_STATE.
3180 */
3181 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3182 if (barycentric_interp_modes & (1 << i)) {
3183 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3184 c->nr_payload_regs += 2;
3185 if (dispatch_width == 16) {
3186 c->nr_payload_regs += 2;
3187 }
3188 }
3189 }
3190
3191 /* R27: interpolated depth if uses source depth */
3192 if (uses_depth) {
3193 c->source_depth_reg = c->nr_payload_regs;
3194 c->nr_payload_regs++;
3195 if (dispatch_width == 16) {
3196 /* R28: interpolated depth if not SIMD8. */
3197 c->nr_payload_regs++;
3198 }
3199 }
3200 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3201 if (uses_depth) {
3202 c->source_w_reg = c->nr_payload_regs;
3203 c->nr_payload_regs++;
3204 if (dispatch_width == 16) {
3205 /* R30: interpolated W if not SIMD8. */
3206 c->nr_payload_regs++;
3207 }
3208 }
3209
3210 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3211 /* R31: MSAA position offsets. */
3212 if (c->prog_data.uses_pos_offset) {
3213 c->sample_pos_reg = c->nr_payload_regs;
3214 c->nr_payload_regs++;
3215 }
3216
3217 /* R32: MSAA input coverage mask */
3218 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3219 assert(brw->gen >= 7);
3220 c->sample_mask_reg = c->nr_payload_regs;
3221 c->nr_payload_regs++;
3222 if (dispatch_width == 16) {
3223 /* R33: input coverage mask if not SIMD8. */
3224 c->nr_payload_regs++;
3225 }
3226 }
3227
3228 /* R34-: bary for 32-pixel. */
3229 /* R58-59: interp W for 32-pixel. */
3230
3231 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3232 c->source_depth_to_render_target = true;
3233 }
3234 }
3235
3236 void
3237 fs_visitor::assign_binding_table_offsets()
3238 {
3239 uint32_t next_binding_table_offset = 0;
3240
3241 /* If there are no color regions, we still perform an FB write to a null
3242 * renderbuffer, which we place at surface index 0.
3243 */
3244 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3245 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3246
3247 assign_common_binding_table_offsets(next_binding_table_offset);
3248 }
3249
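/**
 * Estimates register pressure by summing, for each instruction IP, the sizes
 * of all virtual GRFs whose live interval covers that IP, storing the result
 * per-IP in regs_live_at_ip[].
 */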
3250 void
3251 fs_visitor::calculate_register_pressure()
3252 {
3253 calculate_live_intervals();
3254
3255 int num_instructions = 0;
3256 foreach_list(node, &this->instructions) {
3257 ++num_instructions;
3258 }
3259
3260 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3261
3262 for (int reg = 0; reg < virtual_grf_count; reg++) {
3263 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3264 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3265 }
3266 }
3267
3268 /**
3269 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3270 *
3271 * The needs_unlit_centroid_workaround ends up producing one of these per
3272 * channel of centroid input, so it's good to clean them up.
3273 *
3274 * An assumption here is that nothing ever modifies the dispatched pixels
3275 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3276 * dictates that anyway.
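 *
 * For example (illustrative): of two FS_OPCODE_MOV_DISPATCH_TO_FLAGS writing
 * the same flag subregister with no intervening flag write or control flow,
 * only the first is kept and the second is removed.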
3277 */
3278 void
3279 fs_visitor::opt_drop_redundant_mov_to_flags()
3280 {
3281 bool flag_mov_found[2] = {false};
3282
3283 foreach_list_safe(node, &this->instructions) {
3284 fs_inst *inst = (fs_inst *)node;
3285
3286 if (inst->is_control_flow()) {
3287 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3288 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3289 if (!flag_mov_found[inst->flag_subreg])
3290 flag_mov_found[inst->flag_subreg] = true;
3291 else
3292 inst->remove();
3293 } else if (inst->writes_flag()) {
3294 flag_mov_found[inst->flag_subreg] = false;
3295 }
3296 }
3297 }
3298
3299 bool
3300 fs_visitor::run()
3301 {
3302 sanity_param_count = fp->Base.Parameters->NumParameters;
3303 bool allocated_without_spills;
3304
3305 assign_binding_table_offsets();
3306
3307 if (brw->gen >= 6)
3308 setup_payload_gen6();
3309 else
3310 setup_payload_gen4();
3311
3312 if (0) {
3313 emit_dummy_fs();
3314 } else {
3315 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3316 emit_shader_time_begin();
3317
3318 calculate_urb_setup();
3319 if (fp->Base.InputsRead > 0) {
3320 if (brw->gen < 6)
3321 emit_interpolation_setup_gen4();
3322 else
3323 emit_interpolation_setup_gen6();
3324 }
3325
3326 /* We handle discards by keeping track of the still-live pixels in f0.1.
3327 * Initialize it with the dispatched pixels.
3328 */
3329 if (fp->UsesKill || c->key.alpha_test_func) {
3330 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3331 discard_init->flag_subreg = 1;
3332 }
3333
3334 /* Generate FS IR for main(). (the visitor only descends into
3335 * functions called "main").
3336 */
3337 if (shader) {
3338 foreach_list(node, &*shader->base.ir) {
3339 ir_instruction *ir = (ir_instruction *)node;
3340 base_ir = ir;
3341 this->result = reg_undef;
3342 ir->accept(this);
3343 }
3344 } else {
3345 emit_fragment_program_code();
3346 }
3347 base_ir = NULL;
3348 if (failed)
3349 return false;
3350
3351 emit(FS_OPCODE_PLACEHOLDER_HALT);
3352
3353 if (c->key.alpha_test_func)
3354 emit_alpha_test();
3355
3356 emit_fb_writes();
3357
3358 split_virtual_grfs();
3359
3360 move_uniform_array_access_to_pull_constants();
3361 assign_constant_locations();
3362 demote_pull_constants();
3363
3364 opt_drop_redundant_mov_to_flags();
3365
3366 bool progress;
3367 do {
3368 progress = false;
3369
3370 compact_virtual_grfs();
3371
3372 progress = remove_duplicate_mrf_writes() || progress;
3373
3374 progress = opt_algebraic() || progress;
3375 progress = opt_cse() || progress;
3376 progress = opt_copy_propagate() || progress;
3377 progress = opt_peephole_predicated_break() || progress;
3378 progress = dead_code_eliminate() || progress;
3379 progress = dead_code_eliminate_local() || progress;
3380 progress = opt_peephole_sel() || progress;
3381 progress = dead_control_flow_eliminate(this) || progress;
3382 progress = opt_saturate_propagation() || progress;
3383 progress = register_coalesce() || progress;
3384 progress = compute_to_mrf() || progress;
3385 } while (progress);
3386
3387 lower_uniform_pull_constant_loads();
3388
3389 assign_curb_setup();
3390 assign_urb_setup();
3391
3392 static enum instruction_scheduler_mode pre_modes[] = {
3393 SCHEDULE_PRE,
3394 SCHEDULE_PRE_NON_LIFO,
3395 SCHEDULE_PRE_LIFO,
3396 };
3397
3398 /* Try each scheduling heuristic to see if it can successfully register
3399 * allocate without spilling. They should be ordered by decreasing
3400 * performance but increasing likelihood of allocating.
3401 */
3402 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3403 schedule_instructions(pre_modes[i]);
3404
3405 if (0) {
3406 assign_regs_trivial();
3407 allocated_without_spills = true;
3408 } else {
3409 allocated_without_spills = assign_regs(false);
3410 }
3411 if (allocated_without_spills)
3412 break;
3413 }
3414
3415 if (!allocated_without_spills) {
3416 /* We assume that any spilling is worse than just dropping back to
3417 * SIMD8. There's probably actually some intermediate point where
3418 * SIMD16 with a couple of spills is still better.
3419 */
3420 if (dispatch_width == 16) {
3421 fail("Failure to register allocate. Reduce number of "
3422 "live scalar values to avoid this.");
3423 }
3424
3425 /* Since we're out of heuristics, just go spill registers until we
3426 * get an allocation.
3427 */
3428 while (!assign_regs(true)) {
3429 if (failed)
3430 break;
3431 }
3432 }
3433 }
3434 assert(force_uncompressed_stack == 0);
3435
3436 /* This must come after all optimization and register allocation, since
3437 * it inserts dead code that happens to have side effects, and it does
3438 * so based on the actual physical registers in use.
3439 */
3440 insert_gen4_send_dependency_workarounds();
3441
3442 if (failed)
3443 return false;
3444
3445 if (!allocated_without_spills)
3446 schedule_instructions(SCHEDULE_POST);
3447
3448 if (dispatch_width == 8)
3449 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3450 else
3451 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3452
3453 /* If any state parameters were appended, then ParameterValues could have
3454 * been realloced, in which case the driver uniform storage set up by
3455 * _mesa_associate_uniform_storage() would point to freed memory. Make
3456 * sure that didn't happen.
3457 */
3458 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3459
3460 return !failed;
3461 }
3462
3463 const unsigned *
3464 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3465 struct gl_fragment_program *fp,
3466 struct gl_shader_program *prog,
3467 unsigned *final_assembly_size)
3468 {
3469 bool start_busy = false;
3470 double start_time = 0;
3471
3472 if (unlikely(brw->perf_debug)) {
3473 start_busy = (brw->batch.last_bo &&
3474 drm_intel_bo_busy(brw->batch.last_bo));
3475 start_time = get_time();
3476 }
3477
3478 struct brw_shader *shader = NULL;
3479 if (prog)
3480 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3481
3482 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3483 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3484
3485 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3486 */
3487 fs_visitor v(brw, c, prog, fp, 8);
3488 if (!v.run()) {
3489 if (prog) {
3490 prog->LinkStatus = false;
3491 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3492 }
3493
3494 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3495 v.fail_msg);
3496
3497 return NULL;
3498 }
3499
3500 exec_list *simd16_instructions = NULL;
3501 fs_visitor v2(brw, c, prog, fp, 16);
3502 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3503 if (c->prog_data.base.nr_pull_params == 0) {
3504 /* Try a SIMD16 compile */
3505 v2.import_uniforms(&v);
3506 if (!v2.run()) {
3507 perf_debug("SIMD16 shader failed to compile, falling back to "
3508 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3509 } else {
3510 simd16_instructions = &v2.instructions;
3511 }
3512 } else {
3513 perf_debug("Skipping SIMD16 due to pull parameters.\n");
3514 }
3515 }
3516
3517 const unsigned *assembly = NULL;
3518 if (brw->gen >= 8) {
3519 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3520 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3521 final_assembly_size);
3522 } else {
3523 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3524 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3525 final_assembly_size);
3526 }
3527
3528 if (unlikely(brw->perf_debug) && shader) {
3529 if (shader->compiled_once)
3530 brw_wm_debug_recompile(brw, prog, &c->key);
3531 shader->compiled_once = true;
3532
3533 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3534 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3535 (get_time() - start_time) * 1000);
3536 }
3537 }
3538
3539 return assembly;
3540 }
3541
3542 bool
3543 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3544 {
3545 struct brw_context *brw = brw_context(ctx);
3546 struct brw_wm_prog_key key;
3547
3548 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3549 return true;
3550
3551 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3552 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3553 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3554 bool program_uses_dfdy = fp->UsesDFdy;
3555
3556 memset(&key, 0, sizeof(key));
3557
3558 if (brw->gen < 6) {
3559 if (fp->UsesKill)
3560 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3561
3562 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3563 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3564
3565 /* Just assume depth testing. */
3566 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3567 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3568 }
3569
3570 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3571 BRW_FS_VARYING_INPUT_MASK) > 16)
3572 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3573
3574 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3575
3576 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3577 for (unsigned i = 0; i < sampler_count; i++) {
3578 if (fp->Base.ShadowSamplers & (1 << i)) {
3579 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3580 key.tex.swizzles[i] =
3581 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3582 } else {
3583 /* Color sampler: assume no swizzling. */
3584 key.tex.swizzles[i] = SWIZZLE_XYZW;
3585 }
3586 }
3587
3588 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3589 key.drawable_height = ctx->DrawBuffer->Height;
3590 }
3591
3592 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3593 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3594 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3595
3596 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3597 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3598 key.nr_color_regions > 1;
3599 }
3600
3601 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3602 * quality of the derivatives is likely to be determined by the driconf
3603 * option.
3604 */
3605 key.high_quality_derivatives = brw->disable_derivative_optimization;
3606
3607 key.program_string_id = bfp->id;
3608
3609 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3610 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3611
3612 bool success = do_wm_prog(brw, prog, bfp, &key);
3613
3614 brw->wm.base.prog_offset = old_prog_offset;
3615 brw->wm.prog_data = old_prog_data;
3616
3617 return success;
3618 }