[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
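/* Concretely: in the CMP null<d> example above, gen4 would convert the
 * float sources to the integer dst type before comparing, so forcing the
 * dst type to match src0 below keeps the comparison in the float domain.
 */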
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258 * We break down the const_offset to a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
264 */
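/* For example, with const_offset == 7 the ADD below computes
 * vec4_offset = varying_offset + 4 (7 & ~3), and the reg_offset bump at
 * the end selects component 7 & 3 == 3 of the loaded vec4 (times scale).
 */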
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
303
304 /**
305 * A helper for MOV generation for fixing up broken hardware SEND dependency
306 * handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 this->smear = -1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 subreg_offset == r.subreg_offset &&
436 type == r.type &&
437 negate == r.negate &&
438 abs == r.abs &&
439 !reladdr && !r.reladdr &&
440 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
441 sizeof(fixed_hw_reg)) == 0 &&
442 smear == r.smear &&
443 imm.u == r.imm.u);
444 }
445
446 fs_reg
447 fs_reg::retype(uint32_t type)
448 {
449 fs_reg result = *this;
450 result.type = type;
451 return result;
452 }
453
454 bool
455 fs_reg::is_zero() const
456 {
457 if (file != IMM)
458 return false;
459
460 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
461 }
462
463 bool
464 fs_reg::is_one() const
465 {
466 if (file != IMM)
467 return false;
468
469 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
470 }
471
472 bool
473 fs_reg::is_null() const
474 {
475 return file == HW_REG &&
476 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
477 fixed_hw_reg.nr == BRW_ARF_NULL;
478 }
479
480 bool
481 fs_reg::is_valid_3src() const
482 {
483 return file == GRF || file == UNIFORM;
484 }
485
486 int
487 fs_visitor::type_size(const struct glsl_type *type)
488 {
489 unsigned int size, i;
490
491 switch (type->base_type) {
492 case GLSL_TYPE_UINT:
493 case GLSL_TYPE_INT:
494 case GLSL_TYPE_FLOAT:
495 case GLSL_TYPE_BOOL:
496 return type->components();
497 case GLSL_TYPE_ARRAY:
498 return type_size(type->fields.array) * type->length;
499 case GLSL_TYPE_STRUCT:
500 size = 0;
501 for (i = 0; i < type->length; i++) {
502 size += type_size(type->fields.structure[i].type);
503 }
504 return size;
505 case GLSL_TYPE_SAMPLER:
506 /* Samplers take up no register space, since they're baked in at
507 * link time.
508 */
509 return 0;
510 case GLSL_TYPE_ATOMIC_UINT:
511 return 0;
512 case GLSL_TYPE_IMAGE:
513 case GLSL_TYPE_VOID:
514 case GLSL_TYPE_ERROR:
515 case GLSL_TYPE_INTERFACE:
516 assert(!"not reached");
517 break;
518 }
519
520 return 0;
521 }
522
523 fs_reg
524 fs_visitor::get_timestamp()
525 {
526 assert(brw->gen >= 7);
527
528 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
529 BRW_ARF_TIMESTAMP,
530 0),
531 BRW_REGISTER_TYPE_UD));
532
533 fs_reg dst = fs_reg(this, glsl_type::uint_type);
534
535 fs_inst *mov = emit(MOV(dst, ts));
536 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
537 * even if it's not enabled in the dispatch.
538 */
539 mov->force_writemask_all = true;
540 mov->force_uncompressed = true;
541
542 /* The caller wants the low 32 bits of the timestamp. Since it's running
543      * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
544 * which is plenty of time for our purposes. It is identical across the
545 * EUs, but since it's tracking GPU core speed it will increment at a
546 * varying rate as render P-states change.
547 *
548 * The caller could also check if render P-states have changed (or anything
549 * else that might disrupt timing) by setting smear to 2 and checking if
550 * that field is != 0.
551 */
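/* emit_shader_time_end() below does exactly that: it sets smear to 2 on a
 * second timestamp read and ANDs that field with 1 to detect a reset.
 */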
552 dst.smear = 0;
553
554 return dst;
555 }
556
557 void
558 fs_visitor::emit_shader_time_begin()
559 {
560 current_annotation = "shader time start";
561 shader_start_time = get_timestamp();
562 }
563
564 void
565 fs_visitor::emit_shader_time_end()
566 {
567 current_annotation = "shader time end";
568
569 enum shader_time_shader_type type, written_type, reset_type;
570 if (dispatch_width == 8) {
571 type = ST_FS8;
572 written_type = ST_FS8_WRITTEN;
573 reset_type = ST_FS8_RESET;
574 } else {
575 assert(dispatch_width == 16);
576 type = ST_FS16;
577 written_type = ST_FS16_WRITTEN;
578 reset_type = ST_FS16_RESET;
579 }
580
581 fs_reg shader_end_time = get_timestamp();
582
583 /* Check that there weren't any timestamp reset events (assuming these
584 * were the only two timestamp reads that happened).
585 */
586 fs_reg reset = shader_end_time;
587 reset.smear = 2;
588 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
589 test->conditional_mod = BRW_CONDITIONAL_Z;
590 emit(IF(BRW_PREDICATE_NORMAL));
591
592 push_force_uncompressed();
593 fs_reg start = shader_start_time;
594 start.negate = true;
595 fs_reg diff = fs_reg(this, glsl_type::uint_type);
596 emit(ADD(diff, start, shader_end_time));
597
598 /* If there were no instructions between the two timestamp gets, the diff
599 * is 2 cycles. Remove that overhead, so I can forget about that when
600 * trying to determine the time taken for single instructions.
601 */
602 emit(ADD(diff, diff, fs_reg(-2u)));
603
604 emit_shader_time_write(type, diff);
605 emit_shader_time_write(written_type, fs_reg(1u));
606 emit(BRW_OPCODE_ELSE);
607 emit_shader_time_write(reset_type, fs_reg(1u));
608 emit(BRW_OPCODE_ENDIF);
609
610 pop_force_uncompressed();
611 }
612
613 void
614 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
615 fs_reg value)
616 {
617 int shader_time_index =
618 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
619 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
620
621 fs_reg payload;
622 if (dispatch_width == 8)
623 payload = fs_reg(this, glsl_type::uvec2_type);
624 else
625 payload = fs_reg(this, glsl_type::uint_type);
626
627 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
628 fs_reg(), payload, offset, value));
629 }
630
631 void
632 fs_visitor::fail(const char *format, ...)
633 {
634 va_list va;
635 char *msg;
636
637 if (failed)
638 return;
639
640 failed = true;
641
642 va_start(va, format);
643 msg = ralloc_vasprintf(mem_ctx, format, va);
644 va_end(va);
645 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
646
647 this->fail_msg = msg;
648
649 if (INTEL_DEBUG & DEBUG_WM) {
650 fprintf(stderr, "%s", msg);
651 }
652 }
653
654 fs_inst *
655 fs_visitor::emit(enum opcode opcode)
656 {
657 return emit(fs_inst(opcode));
658 }
659
660 fs_inst *
661 fs_visitor::emit(enum opcode opcode, fs_reg dst)
662 {
663 return emit(fs_inst(opcode, dst));
664 }
665
666 fs_inst *
667 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
668 {
669 return emit(fs_inst(opcode, dst, src0));
670 }
671
672 fs_inst *
673 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
674 {
675 return emit(fs_inst(opcode, dst, src0, src1));
676 }
677
678 fs_inst *
679 fs_visitor::emit(enum opcode opcode, fs_reg dst,
680 fs_reg src0, fs_reg src1, fs_reg src2)
681 {
682 return emit(fs_inst(opcode, dst, src0, src1, src2));
683 }
684
685 void
686 fs_visitor::push_force_uncompressed()
687 {
688 force_uncompressed_stack++;
689 }
690
691 void
692 fs_visitor::pop_force_uncompressed()
693 {
694 force_uncompressed_stack--;
695 assert(force_uncompressed_stack >= 0);
696 }
697
698 /**
699 * Returns true if the instruction has a flag that means it won't
700 * update an entire destination register.
701 *
702 * For example, dead code elimination and live variable analysis want to know
703 * when a write to a variable screens off any preceding values that were in
704 * it.
705 */
706 bool
707 fs_inst::is_partial_write()
708 {
709 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
710 this->force_uncompressed ||
711 this->force_sechalf);
712 }
713
714 int
715 fs_inst::regs_read(fs_visitor *v, int arg)
716 {
717 if (is_tex() && arg == 0 && src[0].file == GRF) {
718 if (v->dispatch_width == 16)
719 return (mlen + 1) / 2;
720 else
721 return mlen;
722 }
723 return 1;
724 }
725
726 bool
727 fs_inst::reads_flag()
728 {
729 return predicate;
730 }
731
732 bool
733 fs_inst::writes_flag()
734 {
735 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
736 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
737 }
738
739 /**
740 * Returns how many MRFs an FS opcode will write over.
741 *
742 * Note that this is not the 0 or 1 implied writes in an actual gen
743 * instruction -- the FS opcodes often generate MOVs in addition.
744 */
745 int
746 fs_visitor::implied_mrf_writes(fs_inst *inst)
747 {
748 if (inst->mlen == 0)
749 return 0;
750
751 if (inst->base_mrf == -1)
752 return 0;
753
754 switch (inst->opcode) {
755 case SHADER_OPCODE_RCP:
756 case SHADER_OPCODE_RSQ:
757 case SHADER_OPCODE_SQRT:
758 case SHADER_OPCODE_EXP2:
759 case SHADER_OPCODE_LOG2:
760 case SHADER_OPCODE_SIN:
761 case SHADER_OPCODE_COS:
762 return 1 * dispatch_width / 8;
763 case SHADER_OPCODE_POW:
764 case SHADER_OPCODE_INT_QUOTIENT:
765 case SHADER_OPCODE_INT_REMAINDER:
766 return 2 * dispatch_width / 8;
767 case SHADER_OPCODE_TEX:
768 case FS_OPCODE_TXB:
769 case SHADER_OPCODE_TXD:
770 case SHADER_OPCODE_TXF:
771 case SHADER_OPCODE_TXF_CMS:
772 case SHADER_OPCODE_TXF_MCS:
773 case SHADER_OPCODE_TG4:
774 case SHADER_OPCODE_TG4_OFFSET:
775 case SHADER_OPCODE_TXL:
776 case SHADER_OPCODE_TXS:
777 case SHADER_OPCODE_LOD:
778 return 1;
779 case FS_OPCODE_FB_WRITE:
780 return 2;
781 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
782 case SHADER_OPCODE_GEN4_SCRATCH_READ:
783 return 1;
784 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
785 return inst->mlen;
786 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
787 return 2;
788 case SHADER_OPCODE_UNTYPED_ATOMIC:
789 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
790 return 0;
791 default:
792 assert(!"not reached");
793 return inst->mlen;
794 }
795 }
796
797 int
798 fs_visitor::virtual_grf_alloc(int size)
799 {
800 if (virtual_grf_array_size <= virtual_grf_count) {
801 if (virtual_grf_array_size == 0)
802 virtual_grf_array_size = 16;
803 else
804 virtual_grf_array_size *= 2;
805 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
806 virtual_grf_array_size);
807 }
808 virtual_grf_sizes[virtual_grf_count] = size;
809 return virtual_grf_count++;
810 }
811
812 /** Fixed HW reg constructor. */
813 fs_reg::fs_reg(enum register_file file, int reg)
814 {
815 init();
816 this->file = file;
817 this->reg = reg;
818 this->type = BRW_REGISTER_TYPE_F;
819 }
820
821 /** Fixed HW reg constructor. */
822 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
823 {
824 init();
825 this->file = file;
826 this->reg = reg;
827 this->type = type;
828 }
829
830 /** Automatic reg constructor. */
831 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
832 {
833 init();
834
835 this->file = GRF;
836 this->reg = v->virtual_grf_alloc(v->type_size(type));
837 this->reg_offset = 0;
838 this->type = brw_type_for_base_type(type);
839 }
840
841 fs_reg *
842 fs_visitor::variable_storage(ir_variable *var)
843 {
844 return (fs_reg *)hash_table_find(this->variable_ht, var);
845 }
846
847 void
848 import_uniforms_callback(const void *key,
849 void *data,
850 void *closure)
851 {
852 struct hash_table *dst_ht = (struct hash_table *)closure;
853 const fs_reg *reg = (const fs_reg *)data;
854
855 if (reg->file != UNIFORM)
856 return;
857
858 hash_table_insert(dst_ht, data, key);
859 }
860
861 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
862  * This brings in those uniform definitions.
863 */
864 void
865 fs_visitor::import_uniforms(fs_visitor *v)
866 {
867 hash_table_call_foreach(v->variable_ht,
868 import_uniforms_callback,
869 variable_ht);
870 this->params_remap = v->params_remap;
871 this->nr_params_remap = v->nr_params_remap;
872 }
873
874 /* Our support for uniforms is piggy-backed on the struct
875 * gl_fragment_program, because that's where the values actually
876 * get stored, rather than in some global gl_shader_program uniform
877 * store.
878 */
879 void
880 fs_visitor::setup_uniform_values(ir_variable *ir)
881 {
882 int namelen = strlen(ir->name);
883
884 /* The data for our (non-builtin) uniforms is stored in a series of
885 * gl_uniform_driver_storage structs for each subcomponent that
886 * glGetUniformLocation() could name. We know it's been set up in the same
887 * order we'd walk the type, so walk the list of storage and find anything
888 * with our name, or the prefix of a component that starts with our name.
889 */
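/* For instance, a "uniform vec4 color[2]" whose storage entry passes the
 * name check gives component_slots() == 4 and array_elements == 2, so the
 * loop below appends 8 param pointers for it.
 */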
890 unsigned params_before = c->prog_data.nr_params;
891 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
892 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
893
894 if (strncmp(ir->name, storage->name, namelen) != 0 ||
895 (storage->name[namelen] != 0 &&
896 storage->name[namelen] != '.' &&
897 storage->name[namelen] != '[')) {
898 continue;
899 }
900
901 unsigned slots = storage->type->component_slots();
902 if (storage->array_elements)
903 slots *= storage->array_elements;
904
905 for (unsigned i = 0; i < slots; i++) {
906 c->prog_data.param[c->prog_data.nr_params++] =
907 &storage->storage[i].f;
908 }
909 }
910
911 /* Make sure we actually initialized the right amount of stuff here. */
912 assert(params_before + ir->type->component_slots() ==
913 c->prog_data.nr_params);
914 (void)params_before;
915 }
916
917
918 /* Our support for builtin uniforms is even scarier than non-builtin.
919 * It sits on top of the PROG_STATE_VAR parameters that are
920 * automatically updated from GL context state.
921 */
922 void
923 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
924 {
925 const ir_state_slot *const slots = ir->state_slots;
926 assert(ir->state_slots != NULL);
927
928 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
929 /* This state reference has already been setup by ir_to_mesa, but we'll
930 * get the same index back here.
931 */
932 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
933 (gl_state_index *)slots[i].tokens);
934
935 /* Add each of the unique swizzles of the element as a parameter.
936 * This'll end up matching the expected layout of the
937 * array/matrix/structure we're trying to fill in.
938 */
939 int last_swiz = -1;
940 for (unsigned int j = 0; j < 4; j++) {
941 int swiz = GET_SWZ(slots[i].swizzle, j);
942 if (swiz == last_swiz)
943 break;
944 last_swiz = swiz;
945
946 c->prog_data.param[c->prog_data.nr_params++] =
947 &fp->Base.Parameters->ParameterValues[index][swiz].f;
948 }
949 }
950 }
951
952 fs_reg *
953 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
954 {
955 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
956 fs_reg wpos = *reg;
957 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
958
959 /* gl_FragCoord.x */
960 if (ir->data.pixel_center_integer) {
961 emit(MOV(wpos, this->pixel_x));
962 } else {
963 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
964 }
965 wpos.reg_offset++;
966
967 /* gl_FragCoord.y */
968 if (!flip && ir->data.pixel_center_integer) {
969 emit(MOV(wpos, this->pixel_y));
970 } else {
971 fs_reg pixel_y = this->pixel_y;
972 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
973
974 if (flip) {
975 pixel_y.negate = true;
976 offset += c->key.drawable_height - 1.0;
977 }
978
979 emit(ADD(wpos, pixel_y, fs_reg(offset)));
980 }
981 wpos.reg_offset++;
982
983 /* gl_FragCoord.z */
984 if (brw->gen >= 6) {
985 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
986 } else {
987 emit(FS_OPCODE_LINTERP, wpos,
988 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
989 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
990 interp_reg(VARYING_SLOT_POS, 2));
991 }
992 wpos.reg_offset++;
993
994 /* gl_FragCoord.w: Already set up in emit_interpolation */
995 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
996
997 return reg;
998 }
999
1000 fs_inst *
1001 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1002 glsl_interp_qualifier interpolation_mode,
1003 bool is_centroid, bool is_sample)
1004 {
1005 brw_wm_barycentric_interp_mode barycoord_mode;
1006 if (brw->gen >= 6) {
1007 if (is_centroid) {
1008 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1009 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1010 else
1011 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1012 } else if (is_sample) {
1013 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1014 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1015 else
1016 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1017 } else {
1018 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1019 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1020 else
1021 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1022 }
1023 } else {
1024 /* On Ironlake and below, there is only one interpolation mode.
1025 * Centroid interpolation doesn't mean anything on this hardware --
1026 * there is no multisampling.
1027 */
1028 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1029 }
1030 return emit(FS_OPCODE_LINTERP, attr,
1031 this->delta_x[barycoord_mode],
1032 this->delta_y[barycoord_mode], interp);
1033 }
1034
1035 fs_reg *
1036 fs_visitor::emit_general_interpolation(ir_variable *ir)
1037 {
1038 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1039 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1040 fs_reg attr = *reg;
1041
1042 unsigned int array_elements;
1043 const glsl_type *type;
1044
1045 if (ir->type->is_array()) {
1046 array_elements = ir->type->length;
1047 if (array_elements == 0) {
1048 fail("dereferenced array '%s' has length 0\n", ir->name);
1049 }
1050 type = ir->type->fields.array;
1051 } else {
1052 array_elements = 1;
1053 type = ir->type;
1054 }
1055
1056 glsl_interp_qualifier interpolation_mode =
1057 ir->determine_interpolation_mode(c->key.flat_shade);
1058
1059 int location = ir->data.location;
1060 for (unsigned int i = 0; i < array_elements; i++) {
1061 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1062 if (c->prog_data.urb_setup[location] == -1) {
1063 /* If there's no incoming setup data for this slot, don't
1064 * emit interpolation for it.
1065 */
1066 attr.reg_offset += type->vector_elements;
1067 location++;
1068 continue;
1069 }
1070
1071 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1072 /* Constant interpolation (flat shading) case. The SF has
1073 * handed us defined values in only the constant offset
1074 * field of the setup reg.
1075 */
1076 for (unsigned int k = 0; k < type->vector_elements; k++) {
1077 struct brw_reg interp = interp_reg(location, k);
1078 interp = suboffset(interp, 3);
1079 interp.type = reg->type;
1080 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1081 attr.reg_offset++;
1082 }
1083 } else {
1084 /* Smooth/noperspective interpolation case. */
1085 for (unsigned int k = 0; k < type->vector_elements; k++) {
1086 /* FINISHME: At some point we probably want to push
1087 * this farther by giving similar treatment to the
1088 * other potentially constant components of the
1089 * attribute, as well as making brw_vs_constval.c
1090 * handle varyings other than gl_TexCoord.
1091 */
1092 struct brw_reg interp = interp_reg(location, k);
1093 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1094 ir->data.centroid && !c->key.persample_shading,
1095 ir->data.sample || c->key.persample_shading);
1096 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1097 /* Get the pixel/sample mask into f0 so that we know
1098 * which pixels are lit. Then, for each channel that is
1099 * unlit, replace the centroid data with non-centroid
1100 * data.
1101 */
1102 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1103 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1104 interpolation_mode,
1105 false, false);
1106 inst->predicate = BRW_PREDICATE_NORMAL;
1107 inst->predicate_inverse = true;
1108 }
1109 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1110 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1111 }
1112 attr.reg_offset++;
1113 }
1114
1115 }
1116 location++;
1117 }
1118 }
1119
1120 return reg;
1121 }
1122
1123 fs_reg *
1124 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1125 {
1126 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1127
1128 /* The frontfacing comes in as a bit in the thread payload. */
1129 if (brw->gen >= 6) {
1130 emit(BRW_OPCODE_ASR, *reg,
1131 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1132 fs_reg(15));
1133 emit(BRW_OPCODE_NOT, *reg, *reg);
1134 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1135 } else {
1136 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1137 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1138 * us front face
1139 */
1140 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1141 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1142 }
1143
1144 return reg;
1145 }
1146
1147 void
1148 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1149 {
1150 assert(dst.type == BRW_REGISTER_TYPE_F);
1151
1152 if (c->key.compute_pos_offset) {
1153 /* Convert int_sample_pos to floating point */
1154 emit(MOV(dst, int_sample_pos));
1155 /* Scale to the range [0, 1] */
1156 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1157 }
1158 else {
1159 /* From ARB_sample_shading specification:
1160 * "When rendering to a non-multisample buffer, or if multisample
1161 * rasterization is disabled, gl_SamplePosition will always be
1162      *  (0.5, 0.5)."
1163 */
1164 emit(MOV(dst, fs_reg(0.5f)));
1165 }
1166 }
1167
1168 fs_reg *
1169 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1170 {
1171 assert(brw->gen >= 6);
1172 assert(ir->type == glsl_type::vec2_type);
1173
1174 this->current_annotation = "compute sample position";
1175 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1176 fs_reg pos = *reg;
1177 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1178 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1179
1180 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1181 * mode will be enabled.
1182 *
1183 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1184 * R31.1:0 Position Offset X/Y for Slot[3:0]
1185 * R31.3:2 Position Offset X/Y for Slot[7:4]
1186 * .....
1187 *
1188 * The X, Y sample positions come in as bytes in thread payload. So, read
1189 * the positions using vstride=16, width=8, hstride=2.
1190 */
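/* With that region, sample_pos_reg walks every other byte: offset 0 picks
 * up the X bytes of the first 8 channels, suboffset 1 the Y bytes, and
 * suboffsets 16/17 the second half of a SIMD16 dispatch.
 */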
1191 struct brw_reg sample_pos_reg =
1192 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1193 BRW_REGISTER_TYPE_B), 16, 8, 2);
1194
1195 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1196 if (dispatch_width == 16) {
1197 int_sample_x.sechalf = true;
1198 fs_inst *inst = emit(MOV(int_sample_x,
1199 fs_reg(suboffset(sample_pos_reg, 16))));
1200 inst->force_sechalf = true;
1201 int_sample_x.sechalf = false;
1202 }
1203 /* Compute gl_SamplePosition.x */
1204 compute_sample_position(pos, int_sample_x);
1205 pos.reg_offset++;
1206 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1207 if (dispatch_width == 16) {
1208 int_sample_y.sechalf = true;
1209 fs_inst *inst = emit(MOV(int_sample_y,
1210 fs_reg(suboffset(sample_pos_reg, 17))));
1211 inst->force_sechalf = true;
1212 int_sample_y.sechalf = false;
1213 }
1214 /* Compute gl_SamplePosition.y */
1215 compute_sample_position(pos, int_sample_y);
1216 return reg;
1217 }
1218
1219 fs_reg *
1220 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1221 {
1222 assert(brw->gen >= 6);
1223
1224 this->current_annotation = "compute sample id";
1225 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1226
1227 if (c->key.compute_sample_id) {
1228 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1229 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1230 t2.type = BRW_REGISTER_TYPE_UW;
1231
1232 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1233 * 8x multisampling, subspan 0 will represent sample N (where N
1234 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1235 * 7. We can find the value of N by looking at R0.0 bits 7:6
1236 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1237 * (since samples are always delivered in pairs). That is, we
1238 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1239 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1240 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1241 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1242 * populating a temporary variable with the sequence (0, 1, 2, 3),
1243 * and then reading from it using vstride=1, width=4, hstride=0.
1244 * These computations hold good for 4x multisampling as well.
1245 */
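/* For example, if R0.0 bits 7:6 hold SSPI == 1, then (R0.0 & 0xc0) >> 5 is
 * 2, and adding the (0, 0, 0, 0, 1, 1, 1, 1) sequence gives sample IDs
 * 2, 2, 2, 2, 3, 3, 3, 3 for the SIMD8 channels.
 */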
1246 emit(BRW_OPCODE_AND, t1,
1247 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1248 fs_reg(brw_imm_d(0xc0)));
1249 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1250 /* This works for both SIMD8 and SIMD16 */
1251 emit(MOV(t2, brw_imm_v(0x3210)));
1252 /* This special instruction takes care of setting vstride=1,
1253 * width=4, hstride=0 of t2 during an ADD instruction.
1254 */
1255 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1256 } else {
1257 /* As per GL_ARB_sample_shading specification:
1258 * "When rendering to a non-multisample buffer, or if multisample
1259 * rasterization is disabled, gl_SampleID will always be zero."
1260 */
1261 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1262 }
1263
1264 return reg;
1265 }
1266
1267 fs_reg *
1268 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1269 {
1270 assert(brw->gen >= 7);
1271 this->current_annotation = "compute gl_SampleMaskIn";
1272 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1273 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1274 return reg;
1275 }
1276
1277 fs_reg
1278 fs_visitor::fix_math_operand(fs_reg src)
1279 {
1280 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1281 * might be able to do better by doing execsize = 1 math and then
1282 * expanding that result out, but we would need to be careful with
1283 * masking.
1284 *
1285 * The hardware ignores source modifiers (negate and abs) on math
1286 * instructions, so we also move to a temp to set those up.
1287 */
1288 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1289 !src.abs && !src.negate)
1290 return src;
1291
1292 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1293 * operands to math
1294 */
1295 if (brw->gen >= 7 && src.file != IMM)
1296 return src;
1297
1298 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1299 expanded.type = src.type;
1300 emit(BRW_OPCODE_MOV, expanded, src);
1301 return expanded;
1302 }
1303
1304 fs_inst *
1305 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1306 {
1307 switch (opcode) {
1308 case SHADER_OPCODE_RCP:
1309 case SHADER_OPCODE_RSQ:
1310 case SHADER_OPCODE_SQRT:
1311 case SHADER_OPCODE_EXP2:
1312 case SHADER_OPCODE_LOG2:
1313 case SHADER_OPCODE_SIN:
1314 case SHADER_OPCODE_COS:
1315 break;
1316 default:
1317 assert(!"not reached: bad math opcode");
1318 return NULL;
1319 }
1320
1321 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1322 * might be able to do better by doing execsize = 1 math and then
1323 * expanding that result out, but we would need to be careful with
1324 * masking.
1325 *
1326 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1327 * instructions, so we also move to a temp to set those up.
1328 */
1329 if (brw->gen >= 6)
1330 src = fix_math_operand(src);
1331
1332 fs_inst *inst = emit(opcode, dst, src);
1333
1334 if (brw->gen < 6) {
1335 inst->base_mrf = 2;
1336 inst->mlen = dispatch_width / 8;
1337 }
1338
1339 return inst;
1340 }
1341
1342 fs_inst *
1343 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1344 {
1345 int base_mrf = 2;
1346 fs_inst *inst;
1347
1348 switch (opcode) {
1349 case SHADER_OPCODE_INT_QUOTIENT:
1350 case SHADER_OPCODE_INT_REMAINDER:
1351 if (brw->gen >= 7 && dispatch_width == 16)
1352 fail("SIMD16 INTDIV unsupported\n");
1353 break;
1354 case SHADER_OPCODE_POW:
1355 break;
1356 default:
1357 assert(!"not reached: unsupported binary math opcode.");
1358 return NULL;
1359 }
1360
1361 if (brw->gen >= 6) {
1362 src0 = fix_math_operand(src0);
1363 src1 = fix_math_operand(src1);
1364
1365 inst = emit(opcode, dst, src0, src1);
1366 } else {
1367 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1368 * "Message Payload":
1369 *
1370 * "Operand0[7]. For the INT DIV functions, this operand is the
1371 * denominator."
1372 * ...
1373 * "Operand1[7]. For the INT DIV functions, this operand is the
1374 * numerator."
1375 */
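/* So for INT DIV the operands are swapped below: src1 (the denominator)
 * becomes Operand0 on the instruction itself, and src0 (the numerator) is
 * moved into the MRF as Operand1.
 */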
1376 bool is_int_div = opcode != SHADER_OPCODE_POW;
1377 fs_reg &op0 = is_int_div ? src1 : src0;
1378 fs_reg &op1 = is_int_div ? src0 : src1;
1379
1380 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1381 inst = emit(opcode, dst, op0, reg_null_f);
1382
1383 inst->base_mrf = base_mrf;
1384 inst->mlen = 2 * dispatch_width / 8;
1385 }
1386 return inst;
1387 }
1388
1389 void
1390 fs_visitor::assign_curb_setup()
1391 {
1392 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1393 if (dispatch_width == 8) {
1394 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1395 } else {
1396 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1397 }
1398
1399 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1400 foreach_list(node, &this->instructions) {
1401 fs_inst *inst = (fs_inst *)node;
1402
1403 for (unsigned int i = 0; i < 3; i++) {
1404 if (inst->src[i].file == UNIFORM) {
1405 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1406 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1407 constant_nr / 8,
1408 constant_nr % 8);
1409
1410 inst->src[i].file = HW_REG;
1411 inst->src[i].fixed_hw_reg = byte_offset(
1412 retype(brw_reg, inst->src[i].type),
1413 inst->src[i].subreg_offset);
1414 }
1415 }
1416 }
1417 }
1418
1419 void
1420 fs_visitor::calculate_urb_setup()
1421 {
1422 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1423 c->prog_data.urb_setup[i] = -1;
1424 }
1425
1426 int urb_next = 0;
1427 /* Figure out where each of the incoming setup attributes lands. */
1428 if (brw->gen >= 6) {
1429 if (_mesa_bitcount_64(fp->Base.InputsRead &
1430 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1431 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1432 * first 16 varying inputs, so we can put them wherever we want.
1433 * Just put them in order.
1434 *
1435 * This is useful because it means that (a) inputs not used by the
1436 * fragment shader won't take up valuable register space, and (b) we
1437 * won't have to recompile the fragment shader if it gets paired with
1438 * a different vertex (or geometry) shader.
1439 */
1440 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1441 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1442 BITFIELD64_BIT(i)) {
1443 c->prog_data.urb_setup[i] = urb_next++;
1444 }
1445 }
1446 } else {
1447 /* We have enough input varyings that the SF/SBE pipeline stage can't
1448 * arbitrarily rearrange them to suit our whim; we have to put them
1449 * in an order that matches the output of the previous pipeline stage
1450 * (geometry or vertex shader).
1451 */
1452 struct brw_vue_map prev_stage_vue_map;
1453 brw_compute_vue_map(brw, &prev_stage_vue_map,
1454 c->key.input_slots_valid);
1455 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1456 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1457 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1458 slot++) {
1459 int varying = prev_stage_vue_map.slot_to_varying[slot];
1460 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1461 * unused.
1462 */
1463 if (varying != BRW_VARYING_SLOT_COUNT &&
1464 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1465 BITFIELD64_BIT(varying))) {
1466 c->prog_data.urb_setup[varying] = slot - first_slot;
1467 }
1468 }
1469 urb_next = prev_stage_vue_map.num_slots - first_slot;
1470 }
1471 } else {
1472 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1473 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1474 /* Point size is packed into the header, not as a general attribute */
1475 if (i == VARYING_SLOT_PSIZ)
1476 continue;
1477
1478 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1479 /* The back color slot is skipped when the front color is
1480 * also written to. In addition, some slots can be
1481 * written in the vertex shader and not read in the
1482 * fragment shader. So the register number must always be
1483 * incremented, mapped or not.
1484 */
1485 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1486 c->prog_data.urb_setup[i] = urb_next;
1487 urb_next++;
1488 }
1489 }
1490
1491 /*
1492  * It's an FS-only attribute, and the SF thread did the interpolation for
1493  * this attribute, so count it here, too.
1494 *
1495 * See compile_sf_prog() for more info.
1496 */
1497 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1498 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1499 }
1500
1501 c->prog_data.num_varying_inputs = urb_next;
1502 }
1503
1504 void
1505 fs_visitor::assign_urb_setup()
1506 {
1507 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1508
1509 /* Offset all the urb_setup[] index by the actual position of the
1510 * setup regs, now that the location of the constants has been chosen.
1511 */
1512 foreach_list(node, &this->instructions) {
1513 fs_inst *inst = (fs_inst *)node;
1514
1515 if (inst->opcode == FS_OPCODE_LINTERP) {
1516 assert(inst->src[2].file == HW_REG);
1517 inst->src[2].fixed_hw_reg.nr += urb_start;
1518 }
1519
1520 if (inst->opcode == FS_OPCODE_CINTERP) {
1521 assert(inst->src[0].file == HW_REG);
1522 inst->src[0].fixed_hw_reg.nr += urb_start;
1523 }
1524 }
1525
1526 /* Each attribute is 4 setup channels, each of which is half a reg. */
1527 this->first_non_payload_grf =
1528 urb_start + c->prog_data.num_varying_inputs * 2;
1529 }
1530
1531 /**
1532 * Split large virtual GRFs into separate components if we can.
1533 *
1534 * This is mostly duplicated with what brw_fs_vector_splitting does,
1535 * but that's really conservative because it's afraid of doing
1536 * splitting that doesn't result in real progress after the rest of
1537 * the optimization phases, which would cause infinite looping in
1538 * optimization. We can do it once here, safely. This also has the
1539 * opportunity to split interpolated values, or maybe even uniforms,
1540 * which we don't have at the IR level.
1541 *
1542 * We want to split, because virtual GRFs are what we register
1543 * allocate and spill (due to contiguousness requirements for some
1544 * instructions), and they're what we naturally generate in the
1545 * codegen process, but most virtual GRFs don't actually need to be
1546 * contiguous sets of GRFs. If we split, we'll end up with reduced
1547 * live intervals and better dead code elimination and coalescing.
1548 */
1549 void
1550 fs_visitor::split_virtual_grfs()
1551 {
1552 int num_vars = this->virtual_grf_count;
1553 bool split_grf[num_vars];
1554 int new_virtual_grf[num_vars];
1555
1556 /* Try to split anything > 0 sized. */
1557 for (int i = 0; i < num_vars; i++) {
1558 if (this->virtual_grf_sizes[i] != 1)
1559 split_grf[i] = true;
1560 else
1561 split_grf[i] = false;
1562 }
1563
1564 if (brw->has_pln &&
1565 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1566 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1567 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1568 * Gen6, that was the only supported interpolation mode, and since Gen6,
1569 * delta_x and delta_y are in fixed hardware registers.
1570 */
1571 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1572 false;
1573 }
1574
1575 foreach_list(node, &this->instructions) {
1576 fs_inst *inst = (fs_inst *)node;
1577
1578 /* If there's a SEND message that requires contiguous destination
1579 * registers, no splitting is allowed.
1580 */
1581 if (inst->regs_written > 1) {
1582 split_grf[inst->dst.reg] = false;
1583 }
1584
1585 /* If we're sending from a GRF, don't split it, on the assumption that
1586 * the send is reading the whole thing.
1587 */
1588 if (inst->is_send_from_grf()) {
1589 for (int i = 0; i < 3; i++) {
1590 if (inst->src[i].file == GRF) {
1591 split_grf[inst->src[i].reg] = false;
1592 }
1593 }
1594 }
1595 }
1596
1597 /* Allocate new space for split regs. Note that the virtual
1598 * numbers will be contiguous.
1599 */
1600 for (int i = 0; i < num_vars; i++) {
1601 if (split_grf[i]) {
1602 new_virtual_grf[i] = virtual_grf_alloc(1);
1603 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1604 int reg = virtual_grf_alloc(1);
1605 assert(reg == new_virtual_grf[i] + j - 1);
1606 (void) reg;
1607 }
1608 this->virtual_grf_sizes[i] = 1;
1609 }
1610 }
1611
1612 foreach_list(node, &this->instructions) {
1613 fs_inst *inst = (fs_inst *)node;
1614
1615 if (inst->dst.file == GRF &&
1616 split_grf[inst->dst.reg] &&
1617 inst->dst.reg_offset != 0) {
1618 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1619 inst->dst.reg_offset - 1);
1620 inst->dst.reg_offset = 0;
1621 }
1622 for (int i = 0; i < 3; i++) {
1623 if (inst->src[i].file == GRF &&
1624 split_grf[inst->src[i].reg] &&
1625 inst->src[i].reg_offset != 0) {
1626 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1627 inst->src[i].reg_offset - 1);
1628 inst->src[i].reg_offset = 0;
1629 }
1630 }
1631 }
1632 invalidate_live_intervals();
1633 }
1634
1635 /**
1636 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1637 *
1638 * During code generation, we create tons of temporary variables, many of
1639 * which get immediately killed and are never used again. Yet, in later
1640 * optimization and analysis passes, such as compute_live_intervals, we need
1641 * to loop over all the virtual GRFs. Compacting them can save a lot of
1642 * overhead.
1643 */
1644 void
1645 fs_visitor::compact_virtual_grfs()
1646 {
1647 /* Mark which virtual GRFs are used, and count how many. */
1648 int remap_table[this->virtual_grf_count];
1649 memset(remap_table, -1, sizeof(remap_table));
1650
1651 foreach_list(node, &this->instructions) {
1652 const fs_inst *inst = (const fs_inst *) node;
1653
1654 if (inst->dst.file == GRF)
1655 remap_table[inst->dst.reg] = 0;
1656
1657 for (int i = 0; i < 3; i++) {
1658 if (inst->src[i].file == GRF)
1659 remap_table[inst->src[i].reg] = 0;
1660 }
1661 }
1662
1663 /* In addition to registers used in instructions, fs_visitor keeps
1664 * direct references to certain special values which must be patched:
1665 */
1666 fs_reg *special[] = {
1667 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1668 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1669 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1670 &delta_x[0], &delta_x[1], &delta_x[2],
1671 &delta_x[3], &delta_x[4], &delta_x[5],
1672 &delta_y[0], &delta_y[1], &delta_y[2],
1673 &delta_y[3], &delta_y[4], &delta_y[5],
1674 };
1675 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1676 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1677
1678 /* Treat all special values as used, to be conservative */
1679 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1680 if (special[i]->file == GRF)
1681 remap_table[special[i]->reg] = 0;
1682 }
1683
1684 /* Compact the GRF arrays. */
1685 int new_index = 0;
1686 for (int i = 0; i < this->virtual_grf_count; i++) {
1687 if (remap_table[i] != -1) {
1688 remap_table[i] = new_index;
1689 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1690 invalidate_live_intervals();
1691 ++new_index;
1692 }
1693 }
1694
1695 this->virtual_grf_count = new_index;
1696
1697 /* Patch all the instructions to use the newly renumbered registers */
1698 foreach_list(node, &this->instructions) {
1699 fs_inst *inst = (fs_inst *) node;
1700
1701 if (inst->dst.file == GRF)
1702 inst->dst.reg = remap_table[inst->dst.reg];
1703
1704 for (int i = 0; i < 3; i++) {
1705 if (inst->src[i].file == GRF)
1706 inst->src[i].reg = remap_table[inst->src[i].reg];
1707 }
1708 }
1709
1710 /* Patch all the references to special values */
1711 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1712 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1713 special[i]->reg = remap_table[special[i]->reg];
1714 }
1715 }
1716
1717 bool
1718 fs_visitor::remove_dead_constants()
1719 {
1720 if (dispatch_width == 8) {
1721 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1722 this->nr_params_remap = c->prog_data.nr_params;
1723
1724 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1725 this->params_remap[i] = -1;
1726
1727 /* Find which params are still in use. */
1728 foreach_list(node, &this->instructions) {
1729 fs_inst *inst = (fs_inst *)node;
1730
1731 for (int i = 0; i < 3; i++) {
1732 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1733
1734 if (inst->src[i].file != UNIFORM)
1735 continue;
1736
1737 /* Section 5.11 of the OpenGL 4.3 spec says:
1738 *
1739 * "Out-of-bounds reads return undefined values, which include
1740 * values from other variables of the active program or zero."
1741 */
1742 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1743 constant_nr = 0;
1744 }
1745
1746 /* For now, set this to non-negative. We'll give it the
1747 * actual new number in a moment, in order to keep the
1748 * register numbers nicely ordered.
1749 */
1750 this->params_remap[constant_nr] = 0;
1751 }
1752 }
1753
1754 /* Figure out what the new numbers for the params will be. At some
1755 * point when we're doing uniform array access, we're going to want
1756 * to keep the distinction between .reg and .reg_offset, but for
1757 * now we don't care.
1758 */
1759 unsigned int new_nr_params = 0;
1760 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1761 if (this->params_remap[i] != -1) {
1762 this->params_remap[i] = new_nr_params++;
1763 }
1764 }
1765
1766 /* Update the list of params to be uploaded to match our new numbering. */
1767 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1768 int remapped = this->params_remap[i];
1769
1770 if (remapped == -1)
1771 continue;
1772
1773 c->prog_data.param[remapped] = c->prog_data.param[i];
1774 }
1775
1776 c->prog_data.nr_params = new_nr_params;
1777 } else {
1778 /* This should have been generated in the SIMD8 pass already. */
1779 assert(this->params_remap);
1780 }
1781
1782 /* Now do the renumbering of the shader to remove unused params. */
1783 foreach_list(node, &this->instructions) {
1784 fs_inst *inst = (fs_inst *)node;
1785
1786 for (int i = 0; i < 3; i++) {
1787 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1788
1789 if (inst->src[i].file != UNIFORM)
1790 continue;
1791
1792 /* as above alias to 0 */
1793 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1794 constant_nr = 0;
1795 }
1796 assert(this->params_remap[constant_nr] != -1);
1797 inst->src[i].reg = this->params_remap[constant_nr];
1798 inst->src[i].reg_offset = 0;
1799 }
1800 }
1801
1802 return true;
1803 }
1804
1805 /*
1806 * Implements array access of uniforms by inserting a
1807 * PULL_CONSTANT_LOAD instruction.
1808 *
1809 * Unlike temporary GRF array access (where we don't support it due to
1810 * the difficulty of doing relative addressing on instruction
1811 * destinations), we could potentially do array access of uniforms
1812 * that were loaded in GRF space as push constants. In real-world
1813 * usage we've seen, though, the arrays being used are always larger
1814 * than we could load as push constants, so just always move all
1815 * uniform array access out to a pull constant buffer.
1816 */
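/*
 * For example, a reladdr access like "uniform vec4 a[20]; ... a[i]" gets all
 * of a's components copied into pull_param[], and the instruction's source is
 * rewritten to read the result of a VARYING_PULL_CONSTANT_LOAD instead.
 */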
1817 void
1818 fs_visitor::move_uniform_array_access_to_pull_constants()
1819 {
1820 int pull_constant_loc[c->prog_data.nr_params];
1821
1822 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1823 pull_constant_loc[i] = -1;
1824 }
1825
1826 /* Walk through and find array access of uniforms. Put a copy of that
1827 * uniform in the pull constant buffer.
1828 *
1829 * Note that we don't move constant-indexed accesses to arrays. No
1830 * testing has been done of the performance impact of this choice.
1831 */
1832 foreach_list_safe(node, &this->instructions) {
1833 fs_inst *inst = (fs_inst *)node;
1834
1835 for (int i = 0 ; i < 3; i++) {
1836 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1837 continue;
1838
1839 int uniform = inst->src[i].reg;
1840
1841 /* If this array isn't already present in the pull constant buffer,
1842 * add it.
1843 */
1844 if (pull_constant_loc[uniform] == -1) {
1845 const float **values = &c->prog_data.param[uniform];
1846
1847 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1848
1849 assert(param_size[uniform]);
1850
1851 for (int j = 0; j < param_size[uniform]; j++) {
1852 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1853 values[j];
1854 }
1855 }
1856
1857 /* Set up the annotation tracking for new generated instructions. */
1858 base_ir = inst->ir;
1859 current_annotation = inst->annotation;
1860
1861 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1862 fs_reg temp = fs_reg(this, glsl_type::float_type);
1863 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1864 surf_index,
1865 *inst->src[i].reladdr,
1866 pull_constant_loc[uniform] +
1867 inst->src[i].reg_offset);
1868 inst->insert_before(&list);
1869
1870 inst->src[i].file = temp.file;
1871 inst->src[i].reg = temp.reg;
1872 inst->src[i].reg_offset = temp.reg_offset;
1873 inst->src[i].reladdr = NULL;
1874 }
1875 }
1876 }
1877
1878 /**
1879 * Choose accesses from the UNIFORM file to demote to using the pull
1880 * constant buffer.
1881 *
1882  * We allow a fragment shader to have more than the GL-specified minimum
1883  * maximum number of fragment shader uniform components (64). If there are
1884  * too many of these, they'd fill up all of the register space.
1885 * So, this will push some of them out to the pull constant buffer and
1886 * update the program to load them.
1887 */
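/* A quick worked example of the split below (the numbers are hypothetical):
 * with max_uniform_components = 16 * 8 = 128, a shader using 200 float
 * uniform components keeps params 0..127 as push constants and demotes
 * params 128..199 to the pull constant buffer, unless a param was already
 * uploaded for reladdr purposes, in which case that pull slot is reused.
 */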
1888 void
1889 fs_visitor::setup_pull_constants()
1890 {
1891 /* Only allow 16 registers (128 uniform components) as push constants. */
1892 unsigned int max_uniform_components = 16 * 8;
1893 if (c->prog_data.nr_params <= max_uniform_components)
1894 return;
1895
1896 if (dispatch_width == 16) {
1897 fail("Pull constants not supported in SIMD16\n");
1898 return;
1899 }
1900
1901 /* Just demote the end of the list. We could probably do better
1902 * here, demoting things that are rarely used in the program first.
1903 */
1904 unsigned int pull_uniform_base = max_uniform_components;
1905
1906 int pull_constant_loc[c->prog_data.nr_params];
1907 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1908 if (i < pull_uniform_base) {
1909 pull_constant_loc[i] = -1;
1910 } else {
1911 pull_constant_loc[i] = -1;
1912 /* If our constant is already being uploaded for reladdr purposes,
1913 * reuse it.
1914 */
1915 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1916 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1917 pull_constant_loc[i] = j;
1918 break;
1919 }
1920 }
1921 if (pull_constant_loc[i] == -1) {
1922 int pull_index = c->prog_data.nr_pull_params++;
1923 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1924 	    pull_constant_loc[i] = pull_index;
1925 }
1926 }
1927 }
1928 c->prog_data.nr_params = pull_uniform_base;
1929
1930 foreach_list(node, &this->instructions) {
1931 fs_inst *inst = (fs_inst *)node;
1932
1933 for (int i = 0; i < 3; i++) {
1934 if (inst->src[i].file != UNIFORM)
1935 continue;
1936
1937 int pull_index = pull_constant_loc[inst->src[i].reg +
1938 inst->src[i].reg_offset];
1939 if (pull_index == -1)
1940 continue;
1941
1942 assert(!inst->src[i].reladdr);
1943
1944 fs_reg dst = fs_reg(this, glsl_type::float_type);
1945 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1946 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1947 fs_inst *pull =
1948 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1949 dst, index, offset);
1950 pull->ir = inst->ir;
1951 pull->annotation = inst->annotation;
1952
1953 inst->insert_before(pull);
1954
1955 inst->src[i].file = GRF;
1956 inst->src[i].reg = dst.reg;
1957 inst->src[i].reg_offset = 0;
1958 inst->src[i].smear = pull_index & 3;
1959 }
1960 }
1961 }
1962
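/* A few illustrative rewrites performed below (vgrf numbers are made up):
 *
 *    mul vgrf4:F, vgrf3:F, 1.0f   ->  mov vgrf4:F, vgrf3:F
 *    mul vgrf4:F, vgrf3:F, 0.0f   ->  mov vgrf4:F, 0.0f
 *    add vgrf4:F, vgrf3:F, 0.0f   ->  mov vgrf4:F, vgrf3:F
 *    or  vgrf4,   vgrf3,   vgrf3  ->  mov vgrf4,   vgrf3
 */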
1963 bool
1964 fs_visitor::opt_algebraic()
1965 {
1966 bool progress = false;
1967
1968 foreach_list(node, &this->instructions) {
1969 fs_inst *inst = (fs_inst *)node;
1970
1971 switch (inst->opcode) {
1972 case BRW_OPCODE_MUL:
1973 if (inst->src[1].file != IMM)
1974 continue;
1975
1976 /* a * 1.0 = a */
1977 if (inst->src[1].is_one()) {
1978 inst->opcode = BRW_OPCODE_MOV;
1979 inst->src[1] = reg_undef;
1980 progress = true;
1981 break;
1982 }
1983
1984 /* a * 0.0 = 0.0 */
1985 if (inst->src[1].is_zero()) {
1986 inst->opcode = BRW_OPCODE_MOV;
1987 inst->src[0] = inst->src[1];
1988 inst->src[1] = reg_undef;
1989 progress = true;
1990 break;
1991 }
1992
1993 break;
1994 case BRW_OPCODE_ADD:
1995 if (inst->src[1].file != IMM)
1996 continue;
1997
1998 /* a + 0.0 = a */
1999 if (inst->src[1].is_zero()) {
2000 inst->opcode = BRW_OPCODE_MOV;
2001 inst->src[1] = reg_undef;
2002 progress = true;
2003 break;
2004 }
2005 break;
2006 case BRW_OPCODE_OR:
2007 if (inst->src[0].equals(inst->src[1])) {
2008 inst->opcode = BRW_OPCODE_MOV;
2009 inst->src[1] = reg_undef;
2010 progress = true;
2011 break;
2012 }
2013 break;
2014 case BRW_OPCODE_LRP:
2015 if (inst->src[1].equals(inst->src[2])) {
2016 inst->opcode = BRW_OPCODE_MOV;
2017 inst->src[0] = inst->src[1];
2018 inst->src[1] = reg_undef;
2019 inst->src[2] = reg_undef;
2020 progress = true;
2021 break;
2022 }
2023 break;
2024 case BRW_OPCODE_SEL:
2025 if (inst->saturate && inst->src[1].file == IMM) {
2026 switch (inst->conditional_mod) {
2027 case BRW_CONDITIONAL_LE:
2028 case BRW_CONDITIONAL_L:
2029 switch (inst->src[1].type) {
2030 case BRW_REGISTER_TYPE_F:
2031 if (inst->src[1].imm.f >= 1.0f) {
2032 inst->opcode = BRW_OPCODE_MOV;
2033 inst->src[1] = reg_undef;
2034 progress = true;
2035 }
2036 break;
2037 default:
2038 break;
2039 }
2040 break;
2041 case BRW_CONDITIONAL_GE:
2042 case BRW_CONDITIONAL_G:
2043 switch (inst->src[1].type) {
2044 case BRW_REGISTER_TYPE_F:
2045 if (inst->src[1].imm.f <= 0.0f) {
2046 inst->opcode = BRW_OPCODE_MOV;
2047 inst->src[1] = reg_undef;
2048 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2049 progress = true;
2050 }
2051 break;
2052 default:
2053 break;
2054 }
2055 default:
2056 break;
2057 }
2058 }
2059 break;
2060 default:
2061 break;
2062 }
2063 }
2064
2065 return progress;
2066 }
2067
2068 /**
2069 * Removes any instructions writing a VGRF where that VGRF is not used by any
2070 * later instruction.
2071 */
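/* Illustrative sketch (made-up registers): if vgrf7 is never read after
 *
 *    add vgrf7:F, vgrf2:F, vgrf3:F
 *
 * and the ADD has no side effects, the instruction is removed outright.
 * A dead MACH/ADDC/SUBB instead gets a null destination, since its implicit
 * accumulator write must be preserved.
 */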
2072 bool
2073 fs_visitor::dead_code_eliminate()
2074 {
2075 bool progress = false;
2076 int pc = 0;
2077
2078 calculate_live_intervals();
2079
2080 foreach_list_safe(node, &this->instructions) {
2081 fs_inst *inst = (fs_inst *)node;
2082
2083 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2084 bool dead = true;
2085
2086 for (int i = 0; i < inst->regs_written; i++) {
2087 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2088 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2089 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2090 dead = false;
2091 break;
2092 }
2093 }
2094
2095 if (dead) {
2096 /* Don't dead code eliminate instructions that write to the
2097 * accumulator as a side-effect. Instead just set the destination
2098 * to the null register to free it.
2099 */
2100 switch (inst->opcode) {
2101 case BRW_OPCODE_ADDC:
2102 case BRW_OPCODE_SUBB:
2103 case BRW_OPCODE_MACH:
2104 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2105 break;
2106 default:
2107 inst->remove();
2108 progress = true;
2109 break;
2110 }
2111 }
2112 }
2113
2114 pc++;
2115 }
2116
2117 if (progress)
2118 invalidate_live_intervals();
2119
2120 return progress;
2121 }
2122
2123 struct dead_code_hash_key
2124 {
2125 int vgrf;
2126 int reg_offset;
2127 };
2128
2129 static bool
2130 dead_code_hash_compare(const void *a, const void *b)
2131 {
2132 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2133 }
2134
2135 static void
2136 clear_dead_code_hash(struct hash_table *ht)
2137 {
2138 struct hash_entry *entry;
2139
2140 hash_table_foreach(ht, entry) {
2141 _mesa_hash_table_remove(ht, entry);
2142 }
2143 }
2144
2145 static void
2146 insert_dead_code_hash(struct hash_table *ht,
2147 int vgrf, int reg_offset, fs_inst *inst)
2148 {
2149 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2150 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2151
2152 key->vgrf = vgrf;
2153 key->reg_offset = reg_offset;
2154
2155 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2156 }
2157
2158 static struct hash_entry *
2159 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2160 {
2161 struct dead_code_hash_key key;
2162
2163 key.vgrf = vgrf;
2164 key.reg_offset = reg_offset;
2165
2166 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2167 }
2168
2169 static void
2170 remove_dead_code_hash(struct hash_table *ht,
2171 int vgrf, int reg_offset)
2172 {
2173 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2174 if (!entry)
2175 return;
2176
2177 _mesa_hash_table_remove(ht, entry);
2178 }
2179
2180 /**
2181 * Walks basic blocks, removing any regs that are written but not read before
2182 * being redefined.
2183 *
2184 * The dead_code_eliminate() function implements a global dead code
2185 * elimination, but it only handles removing the last write to a register
2186 * if it's never read. This one can handle intermediate writes, but only
2187 * within a basic block.
2188 */
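/* Illustrative sketch (made-up registers) of what this pass can catch that
 * the global pass above cannot:
 *
 *    mov vgrf6:F, 1.0f     <- overwritten below without ever being read
 *    mov vgrf6:F, 2.0f
 *    add vgrf7:F, vgrf6:F, vgrf5:F
 *
 * The first MOV is removed, since the second MOV fully rewrites the channel
 * within the same basic block.
 */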
2189 bool
2190 fs_visitor::dead_code_eliminate_local()
2191 {
2192 struct hash_table *ht;
2193 bool progress = false;
2194
2195 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2196
2197 if (ht == NULL) {
2198 return false;
2199 }
2200
2201 foreach_list_safe(node, &this->instructions) {
2202 fs_inst *inst = (fs_inst *)node;
2203
2204       /* At a basic block boundary, empty the HT since we don't track
2205        * dataflow across blocks.
2206 */
2207 if (inst->is_control_flow()) {
2208 clear_dead_code_hash(ht);
2209 continue;
2210 }
2211
2212 /* Clear the HT of any instructions that got read. */
2213 for (int i = 0; i < 3; i++) {
2214 fs_reg src = inst->src[i];
2215 if (src.file != GRF)
2216 continue;
2217
2218 int read = 1;
2219 if (inst->is_send_from_grf())
2220 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2221
2222 for (int reg_offset = src.reg_offset;
2223 reg_offset < src.reg_offset + read;
2224 reg_offset++) {
2225 remove_dead_code_hash(ht, src.reg, reg_offset);
2226 }
2227 }
2228
2229 /* Add any update of a GRF to the HT, removing a previous write if it
2230 * wasn't read.
2231 */
2232 if (inst->dst.file == GRF) {
2233 if (inst->regs_written > 1) {
2234 /* We don't know how to trim channels from an instruction's
2235 * writes, so we can't incrementally remove unread channels from
2236             * it.  Just remove whatever it overwrites from the table.
2237 */
2238 for (int i = 0; i < inst->regs_written; i++) {
2239 remove_dead_code_hash(ht,
2240 inst->dst.reg,
2241 inst->dst.reg_offset + i);
2242 }
2243 } else {
2244 struct hash_entry *entry =
2245 get_dead_code_hash_entry(ht, inst->dst.reg,
2246 inst->dst.reg_offset);
2247
2248 if (entry) {
2249 if (inst->is_partial_write()) {
2250 /* For a partial write, we can't remove any previous dead code
2251 * candidate, since we're just modifying their result.
2252 */
2253 } else {
2254 /* We're completely updating a channel, and there was a
2255 * previous write to the channel that wasn't read. Kill it!
2256 */
2257 fs_inst *inst = (fs_inst *)entry->data;
2258 inst->remove();
2259 progress = true;
2260 }
2261
2262 _mesa_hash_table_remove(ht, entry);
2263 }
2264
2265 if (!inst->has_side_effects())
2266 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2267 inst);
2268 }
2269 }
2270 }
2271
2272 _mesa_hash_table_destroy(ht, NULL);
2273
2274 if (progress)
2275 invalidate_live_intervals();
2276
2277 return progress;
2278 }
2279
2280 /**
2281 * Implements register coalescing: Checks if the two registers involved in a
2282 * raw move don't interfere, in which case they can both be stored in the same
2283 * place and the MOV removed.
2284 *
2285 * To do this, all uses of the source of the MOV in the shader are replaced
2286 * with the destination of the MOV. For example:
2287 *
2288 * add vgrf3:F, vgrf1:F, vgrf2:F
2289 * mov vgrf4:F, vgrf3:F
2290 * mul vgrf5:F, vgrf5:F, vgrf4:F
2291 *
2292 * becomes
2293 *
2294 * add vgrf4:F, vgrf1:F, vgrf2:F
2295 * mul vgrf5:F, vgrf5:F, vgrf4:F
2296 */
2297 bool
2298 fs_visitor::register_coalesce()
2299 {
2300 bool progress = false;
2301
2302 calculate_live_intervals();
2303
2304 int src_size = 0;
2305 int channels_remaining = 0;
2306 int reg_from = -1, reg_to = -1;
2307 int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2308 fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2309
2310 foreach_list(node, &this->instructions) {
2311 fs_inst *inst = (fs_inst *)node;
2312
2313 if (inst->opcode != BRW_OPCODE_MOV ||
2314 inst->is_partial_write() ||
2315 inst->saturate ||
2316 inst->src[0].file != GRF ||
2317 inst->src[0].negate ||
2318 inst->src[0].abs ||
2319 inst->src[0].smear != -1 ||
2320 inst->dst.file != GRF ||
2321 inst->dst.type != inst->src[0].type) {
2322 continue;
2323 }
2324
2325 if (virtual_grf_sizes[inst->src[0].reg] >
2326 virtual_grf_sizes[inst->dst.reg])
2327 continue;
2328
2329 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2330 int var_to = live_intervals->var_from_reg(&inst->dst);
2331
2332 if (live_intervals->vars_interfere(var_from, var_to) &&
2333 !inst->dst.equals(inst->src[0])) {
2334
2335 /* We know that the live ranges of A (var_from) and B (var_to)
2336 * interfere because of the ->vars_interfere() call above. If the end
2337 * of B's live range is after the end of A's range, then we know two
2338 * things:
2339 * - the start of B's live range must be in A's live range (since we
2340 * already know the two ranges interfere, this is the only remaining
2341 * possibility)
2342 * - the interference isn't of the form we're looking for (where B is
2343 * entirely inside A)
2344 */
2345 if (live_intervals->end[var_to] > live_intervals->end[var_from])
2346 continue;
2347
2348 bool overwritten = false;
2349 int scan_ip = -1;
2350
2351 foreach_list(n, &this->instructions) {
2352 fs_inst *scan_inst = (fs_inst *)n;
2353 scan_ip++;
2354
2355 if (scan_inst->is_control_flow()) {
2356 overwritten = true;
2357 break;
2358 }
2359
2360 if (scan_ip <= live_intervals->start[var_to])
2361 continue;
2362
2363 if (scan_ip > live_intervals->end[var_to])
2364 break;
2365
2366 if (scan_inst->dst.equals(inst->dst) ||
2367 scan_inst->dst.equals(inst->src[0])) {
2368 overwritten = true;
2369 break;
2370 }
2371 }
2372
2373 if (overwritten)
2374 continue;
2375 }
2376
2377 if (reg_from != inst->src[0].reg) {
2378 reg_from = inst->src[0].reg;
2379
2380 src_size = virtual_grf_sizes[inst->src[0].reg];
2381 assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2382
2383 channels_remaining = src_size;
2384 memset(mov, 0, sizeof(mov));
2385
2386 reg_to = inst->dst.reg;
2387 }
2388
2389 if (reg_to != inst->dst.reg)
2390 continue;
2391
2392 const int offset = inst->src[0].reg_offset;
2393 reg_to_offset[offset] = inst->dst.reg_offset;
2394 mov[offset] = inst;
2395 channels_remaining--;
2396
2397 if (channels_remaining)
2398 continue;
2399
2400 bool removed = false;
2401 for (int i = 0; i < src_size; i++) {
2402 if (mov[i]) {
2403 removed = true;
2404
2405 mov[i]->opcode = BRW_OPCODE_NOP;
2406 mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2407 mov[i]->dst = reg_undef;
2408 mov[i]->src[0] = reg_undef;
2409 mov[i]->src[1] = reg_undef;
2410 mov[i]->src[2] = reg_undef;
2411 }
2412 }
2413
2414 foreach_list(node, &this->instructions) {
2415 fs_inst *scan_inst = (fs_inst *)node;
2416
2417 for (int i = 0; i < src_size; i++) {
2418 if (mov[i]) {
2419 if (scan_inst->dst.file == GRF &&
2420 scan_inst->dst.reg == reg_from &&
2421 scan_inst->dst.reg_offset == i) {
2422 scan_inst->dst.reg = reg_to;
2423 scan_inst->dst.reg_offset = reg_to_offset[i];
2424 }
2425 for (int j = 0; j < 3; j++) {
2426 if (scan_inst->src[j].file == GRF &&
2427 scan_inst->src[j].reg == reg_from &&
2428 scan_inst->src[j].reg_offset == i) {
2429 scan_inst->src[j].reg = reg_to;
2430 scan_inst->src[j].reg_offset = reg_to_offset[i];
2431 }
2432 }
2433 }
2434 }
2435 }
2436
2437 if (removed) {
2438 live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2439 live_intervals->start[var_from]);
2440 live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2441 live_intervals->end[var_from]);
2442 reg_from = -1;
2443 }
2444 }
2445
2446 foreach_list_safe(node, &this->instructions) {
2447 fs_inst *inst = (fs_inst *)node;
2448
2449 if (inst->opcode == BRW_OPCODE_NOP) {
2450 inst->remove();
2451 progress = true;
2452 }
2453 }
2454
2455 if (progress)
2456 invalidate_live_intervals();
2457
2458 return progress;
2459 }
2460
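/* Sketch of the transformation below (made-up registers):
 *
 *    add vgrf7:F, vgrf3:F, vgrf4:F
 *    mov m4:F, vgrf7:F
 *
 * becomes, when vgrf7 has no later readers and nothing else interferes,
 *
 *    add m4:F, vgrf3:F, vgrf4:F
 */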
2461 bool
2462 fs_visitor::compute_to_mrf()
2463 {
2464 bool progress = false;
2465 int next_ip = 0;
2466
2467 calculate_live_intervals();
2468
2469 foreach_list_safe(node, &this->instructions) {
2470 fs_inst *inst = (fs_inst *)node;
2471
2472 int ip = next_ip;
2473 next_ip++;
2474
2475 if (inst->opcode != BRW_OPCODE_MOV ||
2476 inst->is_partial_write() ||
2477 inst->dst.file != MRF || inst->src[0].file != GRF ||
2478 inst->dst.type != inst->src[0].type ||
2479 inst->src[0].abs || inst->src[0].negate ||
2480 inst->src[0].smear != -1 || inst->src[0].subreg_offset)
2481 continue;
2482
2483 /* Work out which hardware MRF registers are written by this
2484 * instruction.
2485 */
2486 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2487 int mrf_high;
2488 if (inst->dst.reg & BRW_MRF_COMPR4) {
2489 mrf_high = mrf_low + 4;
2490 } else if (dispatch_width == 16 &&
2491 (!inst->force_uncompressed && !inst->force_sechalf)) {
2492 mrf_high = mrf_low + 1;
2493 } else {
2494 mrf_high = mrf_low;
2495 }
2496
2497 /* Can't compute-to-MRF this GRF if someone else was going to
2498 * read it later.
2499 */
2500 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2501 continue;
2502
2503 /* Found a move of a GRF to a MRF. Let's see if we can go
2504 * rewrite the thing that made this GRF to write into the MRF.
2505 */
2506 fs_inst *scan_inst;
2507 for (scan_inst = (fs_inst *)inst->prev;
2508 scan_inst->prev != NULL;
2509 scan_inst = (fs_inst *)scan_inst->prev) {
2510 if (scan_inst->dst.file == GRF &&
2511 scan_inst->dst.reg == inst->src[0].reg) {
2512 	    /* Found the last instruction to write the reg we want to turn
2513 * into a compute-to-MRF.
2514 */
2515
2516 /* If this one instruction didn't populate all the
2517 * channels, bail. We might be able to rewrite everything
2518 * that writes that reg, but it would require smarter
2519 * tracking to delay the rewriting until complete success.
2520 */
2521 if (scan_inst->is_partial_write())
2522 break;
2523
2524 /* Things returning more than one register would need us to
2525 * understand coalescing out more than one MOV at a time.
2526 */
2527 if (scan_inst->regs_written > 1)
2528 break;
2529
2530 /* SEND instructions can't have MRF as a destination. */
2531 if (scan_inst->mlen)
2532 break;
2533
2534 if (brw->gen == 6) {
2535 /* gen6 math instructions must have the destination be
2536 * GRF, so no compute-to-MRF for them.
2537 */
2538 if (scan_inst->is_math()) {
2539 break;
2540 }
2541 }
2542
2543 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2544 /* Found the creator of our MRF's source value. */
2545 scan_inst->dst.file = MRF;
2546 scan_inst->dst.reg = inst->dst.reg;
2547 scan_inst->saturate |= inst->saturate;
2548 inst->remove();
2549 progress = true;
2550 }
2551 break;
2552 }
2553
2554 	 * We don't handle control flow here.  Most computations of
2555 	 * values that end up in MRFs happen shortly before the MRF
2556 * write anyway.
2557 */
2558 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2559 break;
2560
2561 /* You can't read from an MRF, so if someone else reads our
2562 * MRF's source GRF that we wanted to rewrite, that stops us.
2563 */
2564 bool interfered = false;
2565 for (int i = 0; i < 3; i++) {
2566 if (scan_inst->src[i].file == GRF &&
2567 scan_inst->src[i].reg == inst->src[0].reg &&
2568 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2569 interfered = true;
2570 }
2571 }
2572 if (interfered)
2573 break;
2574
2575 if (scan_inst->dst.file == MRF) {
2576 /* If somebody else writes our MRF here, we can't
2577 * compute-to-MRF before that.
2578 */
2579 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2580 int scan_mrf_high;
2581
2582 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2583 scan_mrf_high = scan_mrf_low + 4;
2584 } else if (dispatch_width == 16 &&
2585 (!scan_inst->force_uncompressed &&
2586 !scan_inst->force_sechalf)) {
2587 scan_mrf_high = scan_mrf_low + 1;
2588 } else {
2589 scan_mrf_high = scan_mrf_low;
2590 }
2591
2592 if (mrf_low == scan_mrf_low ||
2593 mrf_low == scan_mrf_high ||
2594 mrf_high == scan_mrf_low ||
2595 mrf_high == scan_mrf_high) {
2596 break;
2597 }
2598 }
2599
2600 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2601 /* Found a SEND instruction, which means that there are
2602 * live values in MRFs from base_mrf to base_mrf +
2603 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2604 * above it.
2605 */
2606 if (mrf_low >= scan_inst->base_mrf &&
2607 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2608 break;
2609 }
2610 if (mrf_high >= scan_inst->base_mrf &&
2611 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2612 break;
2613 }
2614 }
2615 }
2616 }
2617
2618 if (progress)
2619 invalidate_live_intervals();
2620
2621 return progress;
2622 }
2623
2624 /**
2625 * Walks through basic blocks, looking for repeated MRF writes and
2626 * removing the later ones.
2627 */
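/* Sketch of the redundancy removed below (made-up registers):
 *
 *    mov m2:F, vgrf5:F
 *    ... instructions that touch neither m2 nor vgrf5 ...
 *    mov m2:F, vgrf5:F     <- identical MRF write in the same block, removed
 */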
2628 bool
2629 fs_visitor::remove_duplicate_mrf_writes()
2630 {
2631 fs_inst *last_mrf_move[16];
2632 bool progress = false;
2633
2634 /* Need to update the MRF tracking for compressed instructions. */
2635 if (dispatch_width == 16)
2636 return false;
2637
2638 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2639
2640 foreach_list_safe(node, &this->instructions) {
2641 fs_inst *inst = (fs_inst *)node;
2642
2643 if (inst->is_control_flow()) {
2644 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2645 }
2646
2647 if (inst->opcode == BRW_OPCODE_MOV &&
2648 inst->dst.file == MRF) {
2649 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2650 if (prev_inst && inst->equals(prev_inst)) {
2651 inst->remove();
2652 progress = true;
2653 continue;
2654 }
2655 }
2656
2657 /* Clear out the last-write records for MRFs that were overwritten. */
2658 if (inst->dst.file == MRF) {
2659 last_mrf_move[inst->dst.reg] = NULL;
2660 }
2661
2662 if (inst->mlen > 0 && inst->base_mrf != -1) {
2663 /* Found a SEND instruction, which will include two or fewer
2664 * implied MRF writes. We could do better here.
2665 */
2666 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2667 last_mrf_move[inst->base_mrf + i] = NULL;
2668 }
2669 }
2670
2671 /* Clear out any MRF move records whose sources got overwritten. */
2672 if (inst->dst.file == GRF) {
2673 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2674 if (last_mrf_move[i] &&
2675 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2676 last_mrf_move[i] = NULL;
2677 }
2678 }
2679 }
2680
2681 if (inst->opcode == BRW_OPCODE_MOV &&
2682 inst->dst.file == MRF &&
2683 inst->src[0].file == GRF &&
2684 !inst->is_partial_write()) {
2685 last_mrf_move[inst->dst.reg] = inst;
2686 }
2687 }
2688
2689 if (progress)
2690 invalidate_live_intervals();
2691
2692 return progress;
2693 }
2694
2695 static void
2696 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2697 int first_grf, int grf_len)
2698 {
2699 bool inst_simd16 = (dispatch_width > 8 &&
2700 !inst->force_uncompressed &&
2701 !inst->force_sechalf);
2702
2703 /* Clear the flag for registers that actually got read (as expected). */
2704 for (int i = 0; i < 3; i++) {
2705 int grf;
2706 if (inst->src[i].file == GRF) {
2707 grf = inst->src[i].reg;
2708 } else if (inst->src[i].file == HW_REG &&
2709 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2710 grf = inst->src[i].fixed_hw_reg.nr;
2711 } else {
2712 continue;
2713 }
2714
2715 if (grf >= first_grf &&
2716 grf < first_grf + grf_len) {
2717 deps[grf - first_grf] = false;
2718 if (inst_simd16)
2719 deps[grf - first_grf + 1] = false;
2720 }
2721 }
2722 }
2723
2724 /**
2725 * Implements this workaround for the original 965:
2726 *
2727 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2728 * check for post destination dependencies on this instruction, software
2729 * must ensure that there is no destination hazard for the case of ‘write
2730 * followed by a posted write’ shown in the following example.
2731 *
2732 * 1. mov r3 0
2733 * 2. send r3.xy <rest of send instruction>
2734 * 3. mov r2 r3
2735 *
2736 * Due to no post-destination dependency check on the ‘send’, the above
2737 * code sequence could have two instructions (1 and 2) in flight at the
2738 * same time that both consider ‘r3’ as the target of their final writes.
2739 */
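/* The pass below resolves that hazard by inserting a dependency-resolving
 * MOV (DEP_RESOLVE_MOV) that sources the affected register right before the
 * send, so the earlier write must complete first.  Roughly (the register
 * numbers are illustrative and the exact form of the resolve MOV is elided):
 *
 *    1. mov r3 0
 *       mov <resolve> r3        <- inserted before the send
 *    2. send r3.xy <rest of send instruction>
 *    3. mov r2 r3
 */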
2740 void
2741 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2742 {
2743 int reg_size = dispatch_width / 8;
2744 int write_len = inst->regs_written * reg_size;
2745 int first_write_grf = inst->dst.reg;
2746 bool needs_dep[BRW_MAX_MRF];
2747 assert(write_len < (int)sizeof(needs_dep) - 1);
2748
2749 memset(needs_dep, false, sizeof(needs_dep));
2750 memset(needs_dep, true, write_len);
2751
2752 clear_deps_for_inst_src(inst, dispatch_width,
2753 needs_dep, first_write_grf, write_len);
2754
2755 /* Walk backwards looking for writes to registers we're writing which
2756 * aren't read since being written. If we hit the start of the program,
2757 * we assume that there are no outstanding dependencies on entry to the
2758 * program.
2759 */
2760 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2761 scan_inst != NULL;
2762 scan_inst = (fs_inst *)scan_inst->prev) {
2763
2764 /* If we hit control flow, assume that there *are* outstanding
2765 * dependencies, and force their cleanup before our instruction.
2766 */
2767 if (scan_inst->is_control_flow()) {
2768 for (int i = 0; i < write_len; i++) {
2769 if (needs_dep[i]) {
2770 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2771 }
2772 }
2773 return;
2774 }
2775
2776 bool scan_inst_simd16 = (dispatch_width > 8 &&
2777 !scan_inst->force_uncompressed &&
2778 !scan_inst->force_sechalf);
2779
2780 /* We insert our reads as late as possible on the assumption that any
2781 * instruction but a MOV that might have left us an outstanding
2782 * dependency has more latency than a MOV.
2783 */
2784 if (scan_inst->dst.file == GRF) {
2785 for (int i = 0; i < scan_inst->regs_written; i++) {
2786 int reg = scan_inst->dst.reg + i * reg_size;
2787
2788 if (reg >= first_write_grf &&
2789 reg < first_write_grf + write_len &&
2790 needs_dep[reg - first_write_grf]) {
2791 inst->insert_before(DEP_RESOLVE_MOV(reg));
2792 needs_dep[reg - first_write_grf] = false;
2793 if (scan_inst_simd16)
2794 needs_dep[reg - first_write_grf + 1] = false;
2795 }
2796 }
2797 }
2798
2799 /* Clear the flag for registers that actually got read (as expected). */
2800 clear_deps_for_inst_src(scan_inst, dispatch_width,
2801 needs_dep, first_write_grf, write_len);
2802
2803 /* Continue the loop only if we haven't resolved all the dependencies */
2804 int i;
2805 for (i = 0; i < write_len; i++) {
2806 if (needs_dep[i])
2807 break;
2808 }
2809 if (i == write_len)
2810 return;
2811 }
2812 }
2813
2814 /**
2815 * Implements this workaround for the original 965:
2816 *
2817 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2818 * used as a destination register until after it has been sourced by an
2819 * instruction with a different destination register.
2820 */
2821 void
2822 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2823 {
2824 int write_len = inst->regs_written * dispatch_width / 8;
2825 int first_write_grf = inst->dst.reg;
2826 bool needs_dep[BRW_MAX_MRF];
2827 assert(write_len < (int)sizeof(needs_dep) - 1);
2828
2829 memset(needs_dep, false, sizeof(needs_dep));
2830 memset(needs_dep, true, write_len);
2831 /* Walk forwards looking for writes to registers we're writing which aren't
2832 * read before being written.
2833 */
2834 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2835 !scan_inst->is_tail_sentinel();
2836 scan_inst = (fs_inst *)scan_inst->next) {
2837 /* If we hit control flow, force resolve all remaining dependencies. */
2838 if (scan_inst->is_control_flow()) {
2839 for (int i = 0; i < write_len; i++) {
2840 if (needs_dep[i])
2841 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2842 }
2843 return;
2844 }
2845
2846 /* Clear the flag for registers that actually got read (as expected). */
2847 clear_deps_for_inst_src(scan_inst, dispatch_width,
2848 needs_dep, first_write_grf, write_len);
2849
2850 /* We insert our reads as late as possible since they're reading the
2851 * result of a SEND, which has massive latency.
2852 */
2853 if (scan_inst->dst.file == GRF &&
2854 scan_inst->dst.reg >= first_write_grf &&
2855 scan_inst->dst.reg < first_write_grf + write_len &&
2856 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2857 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2858 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2859 }
2860
2861 /* Continue the loop only if we haven't resolved all the dependencies */
2862 int i;
2863 for (i = 0; i < write_len; i++) {
2864 if (needs_dep[i])
2865 break;
2866 }
2867 if (i == write_len)
2868 return;
2869 }
2870
2871 /* If we hit the end of the program, resolve all remaining dependencies out
2872 * of paranoia.
2873 */
2874 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2875 assert(last_inst->eot);
2876 for (int i = 0; i < write_len; i++) {
2877 if (needs_dep[i])
2878 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2879 }
2880 }
2881
2882 void
2883 fs_visitor::insert_gen4_send_dependency_workarounds()
2884 {
2885 if (brw->gen != 4 || brw->is_g4x)
2886 return;
2887
2888 /* Note that we're done with register allocation, so GRF fs_regs always
2889 * have a .reg_offset of 0.
2890 */
2891
2892 foreach_list_safe(node, &this->instructions) {
2893 fs_inst *inst = (fs_inst *)node;
2894
2895 if (inst->mlen != 0 && inst->dst.file == GRF) {
2896 insert_gen4_pre_send_dependency_workarounds(inst);
2897 insert_gen4_post_send_dependency_workarounds(inst);
2898 }
2899 }
2900 }
2901
2902 /**
2903 * Turns the generic expression-style uniform pull constant load instruction
2904 * into a hardware-specific series of instructions for loading a pull
2905 * constant.
2906 *
2907 * The expression style allows the CSE pass before this to optimize out
2908 * repeated loads from the same offset, and gives the pre-register-allocation
2909 * scheduling full flexibility, while the conversion to native instructions
2910 * allows the post-register-allocation scheduler the best information
2911 * possible.
2912 *
2913 * Note that execution masking for setting up pull constant loads is special:
2914 * the channels that need to be written are unrelated to the current execution
2915 * mask, since a later instruction will use one of the result channels as a
2916 * source operand for all 8 or 16 of its channels.
2917 */
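/* Rough sketch of the gen7 lowering done below (made-up registers, and the
 * mnemonics are only approximate):
 *
 *    uniform_pull_const_load vgrf6:F, surf_index, 16u
 *
 * becomes
 *
 *    set_simd4x2_offset vgrf7:UD, 4u           <- 16 bytes / 4 = dword offset
 *    uniform_pull_const_load_gen7 vgrf6:F, surf_index, vgrf7:UD
 *
 * On older gens the instruction instead just gets base_mrf/mlen assigned so
 * the generator can emit the message payload from MRF space.
 */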
2918 void
2919 fs_visitor::lower_uniform_pull_constant_loads()
2920 {
2921 foreach_list(node, &this->instructions) {
2922 fs_inst *inst = (fs_inst *)node;
2923
2924 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2925 continue;
2926
2927 if (brw->gen >= 7) {
2928 /* The offset arg before was a vec4-aligned byte offset. We need to
2929 * turn it into a dword offset.
2930 */
2931 fs_reg const_offset_reg = inst->src[1];
2932 assert(const_offset_reg.file == IMM &&
2933 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2934 const_offset_reg.imm.u /= 4;
2935 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2936
2937 /* This is actually going to be a MOV, but since only the first dword
2938 * is accessed, we have a special opcode to do just that one. Note
2939 * that this needs to be an operation that will be considered a def
2940 * by live variable analysis, or register allocation will explode.
2941 */
2942 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2943 payload, const_offset_reg);
2944 setup->force_writemask_all = true;
2945
2946 setup->ir = inst->ir;
2947 setup->annotation = inst->annotation;
2948 inst->insert_before(setup);
2949
2950 /* Similarly, this will only populate the first 4 channels of the
2951 * result register (since we only use smear values from 0-3), but we
2952 * don't tell the optimizer.
2953 */
2954 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2955 inst->src[1] = payload;
2956
2957 invalidate_live_intervals();
2958 } else {
2959 /* Before register allocation, we didn't tell the scheduler about the
2960 * MRF we use. We know it's safe to use this MRF because nothing
2961 * else does except for register spill/unspill, which generates and
2962 * uses its MRF within a single IR instruction.
2963 */
2964 inst->base_mrf = 14;
2965 inst->mlen = 1;
2966 }
2967 }
2968 }
2969
2970 void
2971 fs_visitor::dump_instructions()
2972 {
2973 calculate_register_pressure();
2974
2975 int ip = 0, max_pressure = 0;
2976 foreach_list(node, &this->instructions) {
2977 backend_instruction *inst = (backend_instruction *)node;
2978 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2979 printf("{%3d} %4d: ", regs_live_at_ip[ip], ip);
2980 dump_instruction(inst);
2981 ++ip;
2982 }
2983 printf("Maximum %3d registers live at once.\n", max_pressure);
2984 }
2985
2986 void
2987 fs_visitor::dump_instruction(backend_instruction *be_inst)
2988 {
2989 fs_inst *inst = (fs_inst *)be_inst;
2990
2991 if (inst->predicate) {
2992 printf("(%cf0.%d) ",
2993 inst->predicate_inverse ? '-' : '+',
2994 inst->flag_subreg);
2995 }
2996
2997 printf("%s", brw_instruction_name(inst->opcode));
2998 if (inst->saturate)
2999 printf(".sat");
3000 if (inst->conditional_mod) {
3001 printf("%s", conditional_modifier[inst->conditional_mod]);
3002 if (!inst->predicate &&
3003 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3004 inst->opcode != BRW_OPCODE_IF &&
3005 inst->opcode != BRW_OPCODE_WHILE))) {
3006 printf(".f0.%d", inst->flag_subreg);
3007 }
3008 }
3009 printf(" ");
3010
3011
3012 switch (inst->dst.file) {
3013 case GRF:
3014 printf("vgrf%d", inst->dst.reg);
3015 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
3016 inst->dst.subreg_offset)
3017 printf("+%d.%d", inst->dst.reg_offset, inst->dst.subreg_offset);
3018 break;
3019 case MRF:
3020 printf("m%d", inst->dst.reg);
3021 break;
3022 case BAD_FILE:
3023 printf("(null)");
3024 break;
3025 case UNIFORM:
3026 printf("***u%d***", inst->dst.reg);
3027 break;
3028 case HW_REG:
3029 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3030 switch (inst->dst.fixed_hw_reg.nr) {
3031 case BRW_ARF_NULL:
3032 printf("null");
3033 break;
3034 case BRW_ARF_ADDRESS:
3035 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
3036 break;
3037 case BRW_ARF_ACCUMULATOR:
3038 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
3039 break;
3040 case BRW_ARF_FLAG:
3041 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3042 inst->dst.fixed_hw_reg.subnr);
3043 break;
3044 default:
3045 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3046 inst->dst.fixed_hw_reg.subnr);
3047 break;
3048 }
3049 } else {
3050 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3051 }
3052 if (inst->dst.fixed_hw_reg.subnr)
3053 printf("+%d", inst->dst.fixed_hw_reg.subnr);
3054 break;
3055 default:
3056 printf("???");
3057 break;
3058 }
3059 printf(":%s, ", reg_encoding[inst->dst.type]);
3060
3061 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3062 if (inst->src[i].negate)
3063 printf("-");
3064 if (inst->src[i].abs)
3065 printf("|");
3066 switch (inst->src[i].file) {
3067 case GRF:
3068 printf("vgrf%d", inst->src[i].reg);
3069 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3070 inst->src[i].subreg_offset)
3071 printf("+%d.%d", inst->src[i].reg_offset,
3072 inst->src[i].subreg_offset);
3073 break;
3074 case MRF:
3075 printf("***m%d***", inst->src[i].reg);
3076 break;
3077 case UNIFORM:
3078 printf("u%d", inst->src[i].reg);
3079 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3080 inst->src[i].subreg_offset)
3081 printf("+%d.%d", inst->src[i].reg_offset,
3082 inst->src[i].subreg_offset);
3083 break;
3084 case BAD_FILE:
3085 printf("(null)");
3086 break;
3087 case IMM:
3088 switch (inst->src[i].type) {
3089 case BRW_REGISTER_TYPE_F:
3090 printf("%ff", inst->src[i].imm.f);
3091 break;
3092 case BRW_REGISTER_TYPE_D:
3093 printf("%dd", inst->src[i].imm.i);
3094 break;
3095 case BRW_REGISTER_TYPE_UD:
3096 printf("%uu", inst->src[i].imm.u);
3097 break;
3098 default:
3099 printf("???");
3100 break;
3101 }
3102 break;
3103 case HW_REG:
3104 if (inst->src[i].fixed_hw_reg.negate)
3105 printf("-");
3106 if (inst->src[i].fixed_hw_reg.abs)
3107 printf("|");
3108 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3109 switch (inst->src[i].fixed_hw_reg.nr) {
3110 case BRW_ARF_NULL:
3111 printf("null");
3112 break;
3113 case BRW_ARF_ADDRESS:
3114 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
3115 break;
3116 case BRW_ARF_ACCUMULATOR:
3117 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
3118 break;
3119 case BRW_ARF_FLAG:
3120 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3121 inst->src[i].fixed_hw_reg.subnr);
3122 break;
3123 default:
3124 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3125 inst->src[i].fixed_hw_reg.subnr);
3126 break;
3127 }
3128 } else {
3129 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3130 }
3131 if (inst->src[i].fixed_hw_reg.subnr)
3132 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3133 if (inst->src[i].fixed_hw_reg.abs)
3134 printf("|");
3135 break;
3136 default:
3137 printf("???");
3138 break;
3139 }
3140 if (inst->src[i].abs)
3141 printf("|");
3142
3143 if (inst->src[i].file != IMM) {
3144 printf(":%s", brw_reg_type_letters(inst->src[i].type));
3145 }
3146
3147 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3148 printf(", ");
3149 }
3150
3151 printf(" ");
3152
3153 if (inst->force_uncompressed)
3154 printf("1sthalf ");
3155
3156 if (inst->force_sechalf)
3157 printf("2ndhalf ");
3158
3159 printf("\n");
3160 }
3161
3162 /**
3163 * Possibly returns an instruction that set up @param reg.
3164 *
3165 * Sometimes we want to take the result of some expression/variable
3166 * dereference tree and rewrite the instruction generating the result
3167 * of the tree. When processing the tree, we know that the
3168 * instructions generated are all writing temporaries that are dead
3169 * outside of this tree. So, if we have some instructions that write
3170 * a temporary, we're free to point that temp write somewhere else.
3171 *
3172 * Note that this doesn't guarantee that the returned instruction wrote
3173 * only reg -- it might be the size=4 destination of a texture instruction.
3174 */
3175 fs_inst *
3176 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3177 fs_inst *end,
3178 fs_reg reg)
3179 {
3180 if (end == start ||
3181 end->is_partial_write() ||
3182 reg.reladdr ||
3183 !reg.equals(end->dst)) {
3184 return NULL;
3185 } else {
3186 return end;
3187 }
3188 }
3189
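/* Worked example of the payload layout computed below (hypothetical case):
 * a SIMD16 shader using one barycentric mode, source depth/W, and neither
 * the position offset nor the coverage mask would get
 *
 *    2 (masks, pixel X/Y) + 4 (barycentric) + 2 (depth) + 2 (W) = 10
 *
 * payload registers, with the per-item starting register numbers recorded
 * in barycentric_coord_reg[], source_depth_reg and source_w_reg.
 */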
3190 void
3191 fs_visitor::setup_payload_gen6()
3192 {
3193 bool uses_depth =
3194 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3195 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3196
3197 assert(brw->gen >= 6);
3198
3199 /* R0-1: masks, pixel X/Y coordinates. */
3200 c->nr_payload_regs = 2;
3201 /* R2: only for 32-pixel dispatch. */
3202
3203 /* R3-26: barycentric interpolation coordinates. These appear in the
3204 * same order that they appear in the brw_wm_barycentric_interp_mode
3205 * enum. Each set of coordinates occupies 2 registers if dispatch width
3206 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3207 * appear if they were enabled using the "Barycentric Interpolation
3208 * Mode" bits in WM_STATE.
3209 */
3210 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3211 if (barycentric_interp_modes & (1 << i)) {
3212 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3213 c->nr_payload_regs += 2;
3214 if (dispatch_width == 16) {
3215 c->nr_payload_regs += 2;
3216 }
3217 }
3218 }
3219
3220 /* R27: interpolated depth if uses source depth */
3221 if (uses_depth) {
3222 c->source_depth_reg = c->nr_payload_regs;
3223 c->nr_payload_regs++;
3224 if (dispatch_width == 16) {
3225 /* R28: interpolated depth if not SIMD8. */
3226 c->nr_payload_regs++;
3227 }
3228 }
3229 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3230 if (uses_depth) {
3231 c->source_w_reg = c->nr_payload_regs;
3232 c->nr_payload_regs++;
3233 if (dispatch_width == 16) {
3234 /* R30: interpolated W if not SIMD8. */
3235 c->nr_payload_regs++;
3236 }
3237 }
3238
3239 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3240 /* R31: MSAA position offsets. */
3241 if (c->prog_data.uses_pos_offset) {
3242 c->sample_pos_reg = c->nr_payload_regs;
3243 c->nr_payload_regs++;
3244 }
3245
3246 /* R32: MSAA input coverage mask */
3247 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3248 assert(brw->gen >= 7);
3249 c->sample_mask_reg = c->nr_payload_regs;
3250 c->nr_payload_regs++;
3251 if (dispatch_width == 16) {
3252 /* R33: input coverage mask if not SIMD8. */
3253 c->nr_payload_regs++;
3254 }
3255 }
3256
3257 /* R34-: bary for 32-pixel. */
3258 /* R58-59: interp W for 32-pixel. */
3259
3260 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3261 c->source_depth_to_render_target = true;
3262 }
3263 }
3264
3265 void
3266 fs_visitor::assign_binding_table_offsets()
3267 {
3268 uint32_t next_binding_table_offset = 0;
3269
3270 /* If there are no color regions, we still perform an FB write to a null
3271 * renderbuffer, which we place at surface index 0.
3272 */
3273 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3274 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3275
3276 assign_common_binding_table_offsets(next_binding_table_offset);
3277 }
3278
3279 void
3280 fs_visitor::calculate_register_pressure()
3281 {
3282 calculate_live_intervals();
3283
3284 int num_instructions = 0;
3285 foreach_list(node, &this->instructions) {
3286 ++num_instructions;
3287 }
3288
3289 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3290
3291 for (int reg = 0; reg < virtual_grf_count; reg++) {
3292 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3293 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3294 }
3295 }
3296
3297 bool
3298 fs_visitor::run()
3299 {
3300 sanity_param_count = fp->Base.Parameters->NumParameters;
3301 uint32_t orig_nr_params = c->prog_data.nr_params;
3302 bool allocated_without_spills;
3303
3304 assign_binding_table_offsets();
3305
3306 if (brw->gen >= 6)
3307 setup_payload_gen6();
3308 else
3309 setup_payload_gen4();
3310
3311 if (0) {
3312 emit_dummy_fs();
3313 } else {
3314 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3315 emit_shader_time_begin();
3316
3317 calculate_urb_setup();
3318 if (fp->Base.InputsRead > 0) {
3319 if (brw->gen < 6)
3320 emit_interpolation_setup_gen4();
3321 else
3322 emit_interpolation_setup_gen6();
3323 }
3324
3325 /* We handle discards by keeping track of the still-live pixels in f0.1.
3326 * Initialize it with the dispatched pixels.
3327 */
3328 if (fp->UsesKill || c->key.alpha_test_func) {
3329 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3330 discard_init->flag_subreg = 1;
3331 }
3332
3333 /* Generate FS IR for main(). (the visitor only descends into
3334 * functions called "main").
3335 */
3336 if (shader) {
3337 foreach_list(node, &*shader->base.ir) {
3338 ir_instruction *ir = (ir_instruction *)node;
3339 base_ir = ir;
3340 this->result = reg_undef;
3341 ir->accept(this);
3342 }
3343 } else {
3344 emit_fragment_program_code();
3345 }
3346 base_ir = NULL;
3347 if (failed)
3348 return false;
3349
3350 emit(FS_OPCODE_PLACEHOLDER_HALT);
3351
3352 if (c->key.alpha_test_func)
3353 emit_alpha_test();
3354
3355 emit_fb_writes();
3356
3357 split_virtual_grfs();
3358
3359 move_uniform_array_access_to_pull_constants();
3360 remove_dead_constants();
3361 setup_pull_constants();
3362
3363 bool progress;
3364 do {
3365 progress = false;
3366
3367 compact_virtual_grfs();
3368
3369 progress = remove_duplicate_mrf_writes() || progress;
3370
3371 progress = opt_algebraic() || progress;
3372 progress = opt_cse() || progress;
3373 progress = opt_copy_propagate() || progress;
3374 progress = opt_peephole_predicated_break() || progress;
3375 progress = dead_code_eliminate() || progress;
3376 progress = dead_code_eliminate_local() || progress;
3377 progress = opt_peephole_sel() || progress;
3378 progress = dead_control_flow_eliminate(this) || progress;
3379 progress = opt_saturate_propagation() || progress;
3380 progress = register_coalesce() || progress;
3381 progress = compute_to_mrf() || progress;
3382 } while (progress);
3383
3384 lower_uniform_pull_constant_loads();
3385
3386 assign_curb_setup();
3387 assign_urb_setup();
3388
3389 static enum instruction_scheduler_mode pre_modes[] = {
3390 SCHEDULE_PRE,
3391 SCHEDULE_PRE_NON_LIFO,
3392 SCHEDULE_PRE_LIFO,
3393 };
3394
3395 /* Try each scheduling heuristic to see if it can successfully register
3396 * allocate without spilling. They should be ordered by decreasing
3397 * performance but increasing likelihood of allocating.
3398 */
3399 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3400 schedule_instructions(pre_modes[i]);
3401
3402 if (0) {
3403 assign_regs_trivial();
3404 allocated_without_spills = true;
3405 } else {
3406 allocated_without_spills = assign_regs(false);
3407 }
3408 if (allocated_without_spills)
3409 break;
3410 }
3411
3412 if (!allocated_without_spills) {
3413 /* We assume that any spilling is worse than just dropping back to
3414 * SIMD8. There's probably actually some intermediate point where
3415 * SIMD16 with a couple of spills is still better.
3416 */
3417 if (dispatch_width == 16) {
3418 fail("Failure to register allocate. Reduce number of "
3419 "live scalar values to avoid this.");
3420 }
3421
3422 /* Since we're out of heuristics, just go spill registers until we
3423 * get an allocation.
3424 */
3425 while (!assign_regs(true)) {
3426 if (failed)
3427 break;
3428 }
3429 }
3430 }
3431 assert(force_uncompressed_stack == 0);
3432
3433 /* This must come after all optimization and register allocation, since
3434 * it inserts dead code that happens to have side effects, and it does
3435 * so based on the actual physical registers in use.
3436 */
3437 insert_gen4_send_dependency_workarounds();
3438
3439 if (failed)
3440 return false;
3441
3442 if (!allocated_without_spills)
3443 schedule_instructions(SCHEDULE_POST);
3444
3445 if (dispatch_width == 8) {
3446 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3447 } else {
3448 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3449
3450 /* Make sure we didn't try to sneak in an extra uniform */
3451 assert(orig_nr_params == c->prog_data.nr_params);
3452 (void) orig_nr_params;
3453 }
3454
3455 /* If any state parameters were appended, then ParameterValues could have
3456 * been realloced, in which case the driver uniform storage set up by
3457 * _mesa_associate_uniform_storage() would point to freed memory. Make
3458 * sure that didn't happen.
3459 */
3460 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3461
3462 return !failed;
3463 }
3464
3465 const unsigned *
3466 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3467 struct gl_fragment_program *fp,
3468 struct gl_shader_program *prog,
3469 unsigned *final_assembly_size)
3470 {
3471 bool start_busy = false;
3472 float start_time = 0;
3473
3474 if (unlikely(brw->perf_debug)) {
3475 start_busy = (brw->batch.last_bo &&
3476 drm_intel_bo_busy(brw->batch.last_bo));
3477 start_time = get_time();
3478 }
3479
3480 struct brw_shader *shader = NULL;
3481 if (prog)
3482 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3483
3484 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3485 if (prog) {
3486 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3487 _mesa_print_ir(shader->base.ir, NULL);
3488 printf("\n\n");
3489 } else {
3490 printf("ARB_fragment_program %d ir for native fragment shader\n",
3491 fp->Base.Id);
3492 _mesa_print_program(&fp->Base);
3493 }
3494 }
3495
3496 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3497 */
3498 fs_visitor v(brw, c, prog, fp, 8);
3499 if (!v.run()) {
3500 if (prog) {
3501 prog->LinkStatus = false;
3502 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3503 }
3504
3505 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3506 v.fail_msg);
3507
3508 return NULL;
3509 }
3510
3511 exec_list *simd16_instructions = NULL;
3512 fs_visitor v2(brw, c, prog, fp, 16);
3513 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3514 if (c->prog_data.nr_pull_params == 0) {
3515 /* Try a SIMD16 compile */
3516 v2.import_uniforms(&v);
3517 if (!v2.run()) {
3518 perf_debug("SIMD16 shader failed to compile, falling back to "
3519 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3520 } else {
3521 simd16_instructions = &v2.instructions;
3522 }
3523 } else {
3524 perf_debug("Skipping SIMD16 due to pull parameters.\n");
3525 }
3526 }
3527
3528 const unsigned *assembly = NULL;
3529 if (brw->gen >= 8) {
3530 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3531 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3532 final_assembly_size);
3533 } else {
3534 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3535 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3536 final_assembly_size);
3537 }
3538
3539 if (unlikely(brw->perf_debug) && shader) {
3540 if (shader->compiled_once)
3541 brw_wm_debug_recompile(brw, prog, &c->key);
3542 shader->compiled_once = true;
3543
3544 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3545 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3546 (get_time() - start_time) * 1000);
3547 }
3548 }
3549
3550 return assembly;
3551 }
3552
3553 bool
3554 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3555 {
3556 struct brw_context *brw = brw_context(ctx);
3557 struct brw_wm_prog_key key;
3558
3559 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3560 return true;
3561
3562 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3563 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3564 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3565 bool program_uses_dfdy = fp->UsesDFdy;
3566
3567 memset(&key, 0, sizeof(key));
3568
3569 if (brw->gen < 6) {
3570 if (fp->UsesKill)
3571 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3572
3573 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3574 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3575
3576 /* Just assume depth testing. */
3577 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3578 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3579 }
3580
3581 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3582 BRW_FS_VARYING_INPUT_MASK) > 16)
3583 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3584
3585 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3586
3587 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3588 for (unsigned i = 0; i < sampler_count; i++) {
3589 if (fp->Base.ShadowSamplers & (1 << i)) {
3590 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3591 key.tex.swizzles[i] =
3592 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3593 } else {
3594 /* Color sampler: assume no swizzling. */
3595 key.tex.swizzles[i] = SWIZZLE_XYZW;
3596 }
3597 }
3598
3599 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3600 key.drawable_height = ctx->DrawBuffer->Height;
3601 }
3602
3603 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3604 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3605 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3606
3607 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3608 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3609 key.nr_color_regions > 1;
3610 }
3611
3612 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3613 * quality of the derivatives is likely to be determined by the driconf
3614 * option.
3615 */
3616 key.high_quality_derivatives = brw->disable_derivative_optimization;
3617
3618 key.program_string_id = bfp->id;
3619
3620 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3621 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3622
3623 bool success = do_wm_prog(brw, prog, bfp, &key);
3624
3625 brw->wm.base.prog_offset = old_prog_offset;
3626 brw->wm.prog_data = old_prog_data;
3627
3628 return success;
3629 }