src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/hash_table.h"
  36 #include "main/macros.h"
  37 #include "main/shaderobj.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "brw_dead_control_flow.h"
  50 #include "main/uniforms.h"
  51 #include "brw_fs_live_variables.h"
  52 #include "glsl/glsl_types.h"
  53
  54 void
  55 fs_inst::init()
  56 {
  57    memset(this, 0, sizeof(*this));
  58    this->opcode = BRW_OPCODE_NOP;
  59    this->conditional_mod = BRW_CONDITIONAL_NONE;
  60
  61    this->dst = reg_undef;
  62    this->src[0] = reg_undef;
  63    this->src[1] = reg_undef;
  64    this->src[2] = reg_undef;
  65
  66    /* This will be the case for almost all instructions. */
  67    this->regs_written = 1;
  68 }
  69
  70 fs_inst::fs_inst()
  71 {
  72    init();
  73 }
  74
  75 fs_inst::fs_inst(enum opcode opcode)
  76 {
  77    init();
  78    this->opcode = opcode;
  79 }
  80
  81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  82 {
  83    init();
  84    this->opcode = opcode;
  85    this->dst = dst;
  86
  87    if (dst.file == GRF)
  88       assert(dst.reg_offset >= 0);
  89 }
  90
  91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  92 {
  93    init();
  94    this->opcode = opcode;
  95    this->dst = dst;
  96    this->src[0] = src0;
  97
  98    if (dst.file == GRF)
  99       assert(dst.reg_offset >= 0);
 100    if (src[0].file == GRF)
 101       assert(src[0].reg_offset >= 0);
 102 }
 103
 104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 105 {
 106    init();
 107    this->opcode = opcode;
 108    this->dst = dst;
 109    this->src[0] = src0;
 110    this->src[1] = src1;
 111
 112    if (dst.file == GRF)
 113       assert(dst.reg_offset >= 0);
 114    if (src[0].file == GRF)
 115       assert(src[0].reg_offset >= 0);
 116    if (src[1].file == GRF)
 117       assert(src[1].reg_offset >= 0);
 118 }
 119
 120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 121                  fs_reg src0, fs_reg src1, fs_reg src2)
 122 {
 123    init();
 124    this->opcode = opcode;
 125    this->dst = dst;
 126    this->src[0] = src0;
 127    this->src[1] = src1;
 128    this->src[2] = src2;
 129
 130    if (dst.file == GRF)
 131       assert(dst.reg_offset >= 0);
 132    if (src[0].file == GRF)
 133       assert(src[0].reg_offset >= 0);
 134    if (src[1].file == GRF)
 135       assert(src[1].reg_offset >= 0);
 136    if (src[2].file == GRF)
 137       assert(src[2].reg_offset >= 0);
 138 }
 139
 140 #define ALU1(op)                                                        \
 141    fs_inst *                                                            \
 142    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 143    {                                                                    \
 144       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 145    }
 146
 147 #define ALU2(op)                                                        \
 148    fs_inst *                                                            \
 149    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 150    {                                                                    \
 151       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 152    }
 153
 154 #define ALU3(op)                                                        \
 155    fs_inst *                                                            \
 156    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
 157    {                                                                    \
 158       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 159    }
 160
 161 ALU1(NOT)
 162 ALU1(MOV)
 163 ALU1(FRC)
 164 ALU1(RNDD)
 165 ALU1(RNDE)
 166 ALU1(RNDZ)
 167 ALU2(ADD)
 168 ALU2(MUL)
 169 ALU2(MACH)
 170 ALU2(AND)
 171 ALU2(OR)
 172 ALU2(XOR)
 173 ALU2(SHL)
 174 ALU2(SHR)
 175 ALU2(ASR)
 176 ALU3(LRP)
 177 ALU1(BFREV)
 178 ALU3(BFE)
 179 ALU2(BFI1)
 180 ALU3(BFI2)
 181 ALU1(FBH)
 182 ALU1(FBL)
 183 ALU1(CBIT)
 184 ALU3(MAD)
 185 ALU2(ADDC)
 186 ALU2(SUBB)
 187 ALU2(SEL)
 188
 189 /** Gen4 predicated IF. */
 190 fs_inst *
 191 fs_visitor::IF(uint32_t predicate)
 192 {
 193    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 194    inst->predicate = predicate;
 195    return inst;
 196 }
 197
 198 /** Gen6 IF with embedded comparison. */
 199 fs_inst *
 200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 201 {
 202    assert(brw->gen == 6);
 203    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 204                                         reg_null_d, src0, src1);
 205    inst->conditional_mod = condition;
 206    return inst;
 207 }
 208
 209 /**
 210  * CMP: Sets the low bit of the destination channels with the result
 211  * of the comparison, while the upper bits are undefined, and updates
 212  * the flag register with the packed 16 bits of the result.
 213  */
 214 fs_inst *
 215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 216 {
 217    fs_inst *inst;
 218
 219    /* Take the instruction:
 220     *
 221     * CMP null<d> src0<f> src1<f>
 222     *
 223     * Original gen4 does type conversion to the destination type before
 224     * comparison, producing garbage results for floating point comparisons.
 225     * gen5 does the comparison on the execution type (resolved source types),
 226     * so dst type doesn't matter.  gen6 does comparison and then uses the
 227     * result as if it was the dst type with no conversion, which happens to
 228     * mostly work out for float-interpreted-as-int since our comparisons are
 229     * for >0, =0, <0.
 230     */
 231    if (brw->gen == 4) {
 232       dst.type = src0.type;
 233       if (dst.file == HW_REG)
 234          dst.fixed_hw_reg.type = dst.type;
 235    }
 236
 237    resolve_ud_negate(&src0);
 238    resolve_ud_negate(&src1);
 239
 240    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 241    inst->conditional_mod = condition;
 242
 243    return inst;
 244 }
 245
 246 exec_list
 247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
 248                                        fs_reg varying_offset,
 249                                        uint32_t const_offset)
 250 {
 251    exec_list instructions;
 252    fs_inst *inst;
 253
 254    /* We have our constant surface use a pitch of 4 bytes, so our index can
 255     * be any component of a vector, and then we load 4 contiguous
 256     * components starting from that.
 257     *
 258     * We break down the const_offset to a portion added to the variable
 259     * offset and a portion done using reg_offset, which means that if you
 260     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 261     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 262     * CSE can later notice that those loads are all the same and eliminate
 263     * the redundant ones.
 264     */
 265    fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
 266    instructions.push_tail(ADD(vec4_offset,
 267                               varying_offset, const_offset & ~3));
 268
 269    int scale = 1;
 270    if (brw->gen == 4 && dispatch_width == 8) {
 271       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 272        * u, v, r) as parameters, or we can just use the SIMD16 message
 273        * consisting of (header, u).  We choose the second, at the cost of a
 274        * longer return length.
 275        */
 276       scale = 2;
 277    }
 278
 279    enum opcode op;
 280    if (brw->gen >= 7)
 281       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 282    else
 283       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 284    fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
 285    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 286    inst->regs_written = 4 * scale;
 287    instructions.push_tail(inst);
 288
 289    if (brw->gen < 7) {
 290       inst->base_mrf = 13;
 291       inst->header_present = true;
 292       if (brw->gen == 4)
 293          inst->mlen = 3;
 294       else
 295          inst->mlen = 1 + dispatch_width / 8;
 296    }
 297
 298    vec4_result.reg_offset += (const_offset & 3) * scale;
 299    instructions.push_tail(MOV(dst, vec4_result));
 300
 301    return instructions;
 302 }
 303
 304 /**
 305  * A helper for MOV generation for fixing up broken hardware SEND dependency
 306  * handling.
 307  */
 308 fs_inst *
 309 fs_visitor::DEP_RESOLVE_MOV(int grf)
 310 {
 311    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 312
 313    inst->ir = NULL;
 314    inst->annotation = "send dependency resolve";
 315
 316    /* The caller always wants uncompressed to emit the minimal extra
 317     * dependencies, and to avoid having to deal with aligning its regs to 2.
 318     */
 319    inst->force_uncompressed = true;
 320
 321    return inst;
 322 }
 323
 324 bool
 325 fs_inst::equals(fs_inst *inst)
 326 {
 327    return (opcode == inst->opcode &&
 328            dst.equals(inst->dst) &&
 329            src[0].equals(inst->src[0]) &&
 330            src[1].equals(inst->src[1]) &&
 331            src[2].equals(inst->src[2]) &&
 332            saturate == inst->saturate &&
 333            predicate == inst->predicate &&
 334            conditional_mod == inst->conditional_mod &&
 335            mlen == inst->mlen &&
 336            base_mrf == inst->base_mrf &&
 337            sampler == inst->sampler &&
 338            target == inst->target &&
 339            eot == inst->eot &&
 340            header_present == inst->header_present &&
 341            shadow_compare == inst->shadow_compare &&
 342            offset == inst->offset);
 343 }
 344
 345 bool
 346 fs_inst::overwrites_reg(const fs_reg &reg)
 347 {
 348    return (reg.file == dst.file &&
 349            reg.reg == dst.reg &&
 350            reg.reg_offset >= dst.reg_offset  &&
 351            reg.reg_offset < dst.reg_offset + regs_written);
 352 }
 353
 354 bool
 355 fs_inst::is_send_from_grf()
 356 {
 357    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 358            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
 359            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 360             src[1].file == GRF) ||
 361            (is_tex() && src[0].file == GRF));
 362 }
 363
 364 bool
 365 fs_visitor::can_do_source_mods(fs_inst *inst)
 366 {
 367    if (brw->gen == 6 && inst->is_math())
 368       return false;
 369
 370    if (inst->is_send_from_grf())
 371       return false;
 372
 373    if (!inst->can_do_source_mods())
 374       return false;
 375
 376    return true;
 377 }
 378
 379 void
 380 fs_reg::init()
 381 {
 382    memset(this, 0, sizeof(*this));
 383    stride = 1;
 384 }
 385
 386 /** Generic unset register constructor. */
 387 fs_reg::fs_reg()
 388 {
 389    init();
 390    this->file = BAD_FILE;
 391 }
 392
 393 /** Immediate value constructor. */
 394 fs_reg::fs_reg(float f)
 395 {
 396    init();
 397    this->file = IMM;
 398    this->type = BRW_REGISTER_TYPE_F;
 399    this->imm.f = f;
 400 }
 401
 402 /** Immediate value constructor. */
 403 fs_reg::fs_reg(int32_t i)
 404 {
 405    init();
 406    this->file = IMM;
 407    this->type = BRW_REGISTER_TYPE_D;
 408    this->imm.i = i;
 409 }
 410
 411 /** Immediate value constructor. */
 412 fs_reg::fs_reg(uint32_t u)
 413 {
 414    init();
 415    this->file = IMM;
 416    this->type = BRW_REGISTER_TYPE_UD;
 417    this->imm.u = u;
 418 }
 419
 420 /** Fixed brw_reg. */
 421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 422 {
 423    init();
 424    this->file = HW_REG;
 425    this->fixed_hw_reg = fixed_hw_reg;
 426    this->type = fixed_hw_reg.type;
 427 }
 428
 429 bool
 430 fs_reg::equals(const fs_reg &r) const
 431 {
 432    return (file == r.file &&
 433            reg == r.reg &&
 434            reg_offset == r.reg_offset &&
 435            subreg_offset == r.subreg_offset &&
 436            type == r.type &&
 437            negate == r.negate &&
 438            abs == r.abs &&
 439            !reladdr && !r.reladdr &&
 440            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 441                   sizeof(fixed_hw_reg)) == 0 &&
 442            stride == r.stride &&
 443            imm.u == r.imm.u);
 444 }
 445
 446 fs_reg &
 447 fs_reg::apply_stride(unsigned stride)
 448 {
 449    assert((this->stride * stride) <= 4 &&
 450           (is_power_of_two(stride) || stride == 0) &&
 451           file != HW_REG && file != IMM);
 452    this->stride *= stride;
 453    return *this;
 454 }
 455
 456 fs_reg &
 457 fs_reg::set_smear(unsigned subreg)
 458 {
 459    assert(file != HW_REG && file != IMM);
 460    subreg_offset = subreg * type_sz(type);
 461    stride = 0;
 462    return *this;
 463 }
 464
 465 bool
 466 fs_reg::is_contiguous() const
 467 {
 468    return stride == 1;
 469 }
 470
 471 bool
 472 fs_reg::is_zero() const
 473 {
 474    if (file != IMM)
 475       return false;
 476
 477    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 478 }
 479
 480 bool
 481 fs_reg::is_one() const
 482 {
 483    if (file != IMM)
 484       return false;
 485
 486    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 487 }
 488
 489 bool
 490 fs_reg::is_null() const
 491 {
 492    return file == HW_REG &&
 493           fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 494           fixed_hw_reg.nr == BRW_ARF_NULL;
 495 }
 496
 497 bool
 498 fs_reg::is_valid_3src() const
 499 {
 500    return file == GRF || file == UNIFORM;
 501 }
 502
 503 int
 504 fs_visitor::type_size(const struct glsl_type *type)
 505 {
 506    unsigned int size, i;
 507
 508    switch (type->base_type) {
 509    case GLSL_TYPE_UINT:
 510    case GLSL_TYPE_INT:
 511    case GLSL_TYPE_FLOAT:
 512    case GLSL_TYPE_BOOL:
 513       return type->components();
 514    case GLSL_TYPE_ARRAY:
 515       return type_size(type->fields.array) * type->length;
 516    case GLSL_TYPE_STRUCT:
 517       size = 0;
 518       for (i = 0; i < type->length; i++) {
 519          size += type_size(type->fields.structure[i].type);
 520       }
 521       return size;
 522    case GLSL_TYPE_SAMPLER:
 523       /* Samplers take up no register space, since they're baked in at
 524        * link time.
 525        */
 526       return 0;
 527    case GLSL_TYPE_ATOMIC_UINT:
 528       return 0;
 529    case GLSL_TYPE_IMAGE:
 530    case GLSL_TYPE_VOID:
 531    case GLSL_TYPE_ERROR:
 532    case GLSL_TYPE_INTERFACE:
 533       assert(!"not reached");
 534       break;
 535    }
 536
 537    return 0;
 538 }
 539
 540 fs_reg
 541 fs_visitor::get_timestamp()
 542 {
 543    assert(brw->gen >= 7);
 544
 545    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 546                                           BRW_ARF_TIMESTAMP,
 547                                           0),
 548                              BRW_REGISTER_TYPE_UD));
 549
 550    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 551
 552    fs_inst *mov = emit(MOV(dst, ts));
 553    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 554     * even if it's not enabled in the dispatch.
 555     */
 556    mov->force_writemask_all = true;
 557    mov->force_uncompressed = true;
 558
 559    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 560     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 561     * which is plenty of time for our purposes.  It is identical across the
 562     * EUs, but since it's tracking GPU core speed it will increment at a
 563     * varying rate as render P-states change.
 564     *
 565     * The caller could also check if render P-states have changed (or anything
 566     * else that might disrupt timing) by setting smear to 2 and checking if
 567     * that field is != 0.
 568     */
 569    dst.set_smear(0);
 570
 571    return dst;
 572 }
 573
 574 void
 575 fs_visitor::emit_shader_time_begin()
 576 {
 577    current_annotation = "shader time start";
 578    shader_start_time = get_timestamp();
 579 }
 580
 581 void
 582 fs_visitor::emit_shader_time_end()
 583 {
 584    current_annotation = "shader time end";
 585
 586    enum shader_time_shader_type type, written_type, reset_type;
 587    if (dispatch_width == 8) {
 588       type = ST_FS8;
 589       written_type = ST_FS8_WRITTEN;
 590       reset_type = ST_FS8_RESET;
 591    } else {
 592       assert(dispatch_width == 16);
 593       type = ST_FS16;
 594       written_type = ST_FS16_WRITTEN;
 595       reset_type = ST_FS16_RESET;
 596    }
 597
 598    fs_reg shader_end_time = get_timestamp();
 599
 600    /* Check that there weren't any timestamp reset events (assuming these
 601     * were the only two timestamp reads that happened).
 602     */
 603    fs_reg reset = shader_end_time;
 604    reset.set_smear(2);
 605    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 606    test->conditional_mod = BRW_CONDITIONAL_Z;
 607    emit(IF(BRW_PREDICATE_NORMAL));
 608
 609    push_force_uncompressed();
 610    fs_reg start = shader_start_time;
 611    start.negate = true;
 612    fs_reg diff = fs_reg(this, glsl_type::uint_type);
 613    emit(ADD(diff, start, shader_end_time));
 614
 615    /* If there were no instructions between the two timestamp gets, the diff
 616     * is 2 cycles.  Remove that overhead, so I can forget about that when
 617     * trying to determine the time taken for single instructions.
 618     */
 619    emit(ADD(diff, diff, fs_reg(-2u)));
 620
 621    emit_shader_time_write(type, diff);
 622    emit_shader_time_write(written_type, fs_reg(1u));
 623    emit(BRW_OPCODE_ELSE);
 624    emit_shader_time_write(reset_type, fs_reg(1u));
 625    emit(BRW_OPCODE_ENDIF);
 626
 627    pop_force_uncompressed();
 628 }
 629
 630 void
 631 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 632                                    fs_reg value)
 633 {
 634    int shader_time_index =
 635       brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
 636    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 637
 638    fs_reg payload;
 639    if (dispatch_width == 8)
 640       payload = fs_reg(this, glsl_type::uvec2_type);
 641    else
 642       payload = fs_reg(this, glsl_type::uint_type);
 643
 644    emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 645                 fs_reg(), payload, offset, value));
 646 }
 647
 648 void
 649 fs_visitor::fail(const char *format, ...)
 650 {
 651    va_list va;
 652    char *msg;
 653
 654    if (failed)
 655       return;
 656
 657    failed = true;
 658
 659    va_start(va, format);
 660    msg = ralloc_vasprintf(mem_ctx, format, va);
 661    va_end(va);
 662    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 663
 664    this->fail_msg = msg;
 665
 666    if (INTEL_DEBUG & DEBUG_WM) {
 667       fprintf(stderr, "%s",  msg);
 668    }
 669 }
 670
 671 fs_inst *
 672 fs_visitor::emit(enum opcode opcode)
 673 {
 674    return emit(fs_inst(opcode));
 675 }
 676
 677 fs_inst *
 678 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 679 {
 680    return emit(fs_inst(opcode, dst));
 681 }
 682
 683 fs_inst *
 684 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 685 {
 686    return emit(fs_inst(opcode, dst, src0));
 687 }
 688
 689 fs_inst *
 690 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 691 {
 692    return emit(fs_inst(opcode, dst, src0, src1));
 693 }
 694
 695 fs_inst *
 696 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 697                  fs_reg src0, fs_reg src1, fs_reg src2)
 698 {
 699    return emit(fs_inst(opcode, dst, src0, src1, src2));
 700 }
 701
 702 void
 703 fs_visitor::push_force_uncompressed()
 704 {
 705    force_uncompressed_stack++;
 706 }
 707
 708 void
 709 fs_visitor::pop_force_uncompressed()
 710 {
 711    force_uncompressed_stack--;
 712    assert(force_uncompressed_stack >= 0);
 713 }
 714
 715 /**
 716  * Returns true if the instruction has a flag that means it won't
 717  * update an entire destination register.
 718  *
 719  * For example, dead code elimination and live variable analysis want to know
 720  * when a write to a variable screens off any preceding values that were in
 721  * it.
 722  */
 723 bool
 724 fs_inst::is_partial_write()
 725 {
 726    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 727            this->force_uncompressed ||
 728            this->force_sechalf || !this->dst.is_contiguous());
 729 }
 730
 731 int
 732 fs_inst::regs_read(fs_visitor *v, int arg)
 733 {
 734    if (is_tex() && arg == 0 && src[0].file == GRF) {
 735       if (v->dispatch_width == 16)
 736          return (mlen + 1) / 2;
 737       else
 738          return mlen;
 739    }
 740    return 1;
 741 }
 742
 743 bool
 744 fs_inst::reads_flag()
 745 {
 746    return predicate;
 747 }
 748
 749 bool
 750 fs_inst::writes_flag()
 751 {
 752    return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
 753           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 754 }
 755
 756 /**
 757  * Returns how many MRFs an FS opcode will write over.
 758  *
 759  * Note that this is not the 0 or 1 implied writes in an actual gen
 760  * instruction -- the FS opcodes often generate MOVs in addition.
 761  */
 762 int
 763 fs_visitor::implied_mrf_writes(fs_inst *inst)
 764 {
 765    if (inst->mlen == 0)
 766       return 0;
 767
 768    if (inst->base_mrf == -1)
 769       return 0;
 770
 771    switch (inst->opcode) {
 772    case SHADER_OPCODE_RCP:
 773    case SHADER_OPCODE_RSQ:
 774    case SHADER_OPCODE_SQRT:
 775    case SHADER_OPCODE_EXP2:
 776    case SHADER_OPCODE_LOG2:
 777    case SHADER_OPCODE_SIN:
 778    case SHADER_OPCODE_COS:
 779       return 1 * dispatch_width / 8;
 780    case SHADER_OPCODE_POW:
 781    case SHADER_OPCODE_INT_QUOTIENT:
 782    case SHADER_OPCODE_INT_REMAINDER:
 783       return 2 * dispatch_width / 8;
 784    case SHADER_OPCODE_TEX:
 785    case FS_OPCODE_TXB:
 786    case SHADER_OPCODE_TXD:
 787    case SHADER_OPCODE_TXF:
 788    case SHADER_OPCODE_TXF_CMS:
 789    case SHADER_OPCODE_TXF_MCS:
 790    case SHADER_OPCODE_TG4:
 791    case SHADER_OPCODE_TG4_OFFSET:
 792    case SHADER_OPCODE_TXL:
 793    case SHADER_OPCODE_TXS:
 794    case SHADER_OPCODE_LOD:
 795       return 1;
 796    case FS_OPCODE_FB_WRITE:
 797       return 2;
 798    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 799    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 800       return 1;
 801    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 802       return inst->mlen;
 803    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 804       return 2;
 805    case SHADER_OPCODE_UNTYPED_ATOMIC:
 806    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 807       return 0;
 808    default:
 809       assert(!"not reached");
 810       return inst->mlen;
 811    }
 812 }
 813
 814 int
 815 fs_visitor::virtual_grf_alloc(int size)
 816 {
 817    if (virtual_grf_array_size <= virtual_grf_count) {
 818       if (virtual_grf_array_size == 0)
 819          virtual_grf_array_size = 16;
 820       else
 821          virtual_grf_array_size *= 2;
 822       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 823                                    virtual_grf_array_size);
 824    }
 825    virtual_grf_sizes[virtual_grf_count] = size;
 826    return virtual_grf_count++;
 827 }
 828
 829 /** Fixed HW reg constructor. */
 830 fs_reg::fs_reg(enum register_file file, int reg)
 831 {
 832    init();
 833    this->file = file;
 834    this->reg = reg;
 835    this->type = BRW_REGISTER_TYPE_F;
 836 }
 837
 838 /** Fixed HW reg constructor. */
 839 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 840 {
 841    init();
 842    this->file = file;
 843    this->reg = reg;
 844    this->type = type;
 845 }
 846
 847 /** Automatic reg constructor. */
 848 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 849 {
 850    init();
 851
 852    this->file = GRF;
 853    this->reg = v->virtual_grf_alloc(v->type_size(type));
 854    this->reg_offset = 0;
 855    this->type = brw_type_for_base_type(type);
 856 }
 857
 858 fs_reg *
 859 fs_visitor::variable_storage(ir_variable *var)
 860 {
 861    return (fs_reg *)hash_table_find(this->variable_ht, var);
 862 }
 863
 864 void
 865 import_uniforms_callback(const void *key,
 866                          void *data,
 867                          void *closure)
 868 {
 869    struct hash_table *dst_ht = (struct hash_table *)closure;
 870    const fs_reg *reg = (const fs_reg *)data;
 871
 872    if (reg->file != UNIFORM)
 873       return;
 874
 875    hash_table_insert(dst_ht, data, key);
 876 }
 877
 878 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 879  * This brings in those uniform definitions
 880  */
 881 void
 882 fs_visitor::import_uniforms(fs_visitor *v)
 883 {
 884    hash_table_call_foreach(v->variable_ht,
 885                            import_uniforms_callback,
 886                            variable_ht);
 887    this->params_remap = v->params_remap;
 888    this->nr_params_remap = v->nr_params_remap;
 889 }
 890
 891 /* Our support for uniforms is piggy-backed on the struct
 892  * gl_fragment_program, because that's where the values actually
 893  * get stored, rather than in some global gl_shader_program uniform
 894  * store.
 895  */
 896 void
 897 fs_visitor::setup_uniform_values(ir_variable *ir)
 898 {
 899    int namelen = strlen(ir->name);
 900
 901    /* The data for our (non-builtin) uniforms is stored in a series of
 902     * gl_uniform_driver_storage structs for each subcomponent that
 903     * glGetUniformLocation() could name.  We know it's been set up in the same
 904     * order we'd walk the type, so walk the list of storage and find anything
 905     * with our name, or the prefix of a component that starts with our name.
 906     */
 907    unsigned params_before = uniforms;
 908    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 909       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 910
 911       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 912           (storage->name[namelen] != 0 &&
 913            storage->name[namelen] != '.' &&
 914            storage->name[namelen] != '[')) {
 915          continue;
 916       }
 917
 918       unsigned slots = storage->type->component_slots();
 919       if (storage->array_elements)
 920          slots *= storage->array_elements;
 921
 922       for (unsigned i = 0; i < slots; i++) {
 923          stage_prog_data->param[uniforms++] = &storage->storage[i].f;
 924       }
 925    }
 926
 927    /* Make sure we actually initialized the right amount of stuff here. */
 928    assert(params_before + ir->type->component_slots() == uniforms);
 929    (void)params_before;
 930 }
 931
 932
 933 /* Our support for builtin uniforms is even scarier than non-builtin.
 934  * It sits on top of the PROG_STATE_VAR parameters that are
 935  * automatically updated from GL context state.
 936  */
 937 void
 938 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 939 {
 940    const ir_state_slot *const slots = ir->state_slots;
 941    assert(ir->state_slots != NULL);
 942
 943    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 944       /* This state reference has already been setup by ir_to_mesa, but we'll
 945        * get the same index back here.
 946        */
 947       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 948                                             (gl_state_index *)slots[i].tokens);
 949
 950       /* Add each of the unique swizzles of the element as a parameter.
 951        * This'll end up matching the expected layout of the
 952        * array/matrix/structure we're trying to fill in.
 953        */
 954       int last_swiz = -1;
 955       for (unsigned int j = 0; j < 4; j++) {
 956          int swiz = GET_SWZ(slots[i].swizzle, j);
 957          if (swiz == last_swiz)
 958             break;
 959          last_swiz = swiz;
 960
 961          stage_prog_data->param[uniforms++] =
 962             &fp->Base.Parameters->ParameterValues[index][swiz].f;
 963       }
 964    }
 965 }
 966
 967 fs_reg *
 968 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 969 {
 970    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 971    fs_reg wpos = *reg;
 972    bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
 973
 974    /* gl_FragCoord.x */
 975    if (ir->data.pixel_center_integer) {
 976       emit(MOV(wpos, this->pixel_x));
 977    } else {
 978       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
 979    }
 980    wpos.reg_offset++;
 981
 982    /* gl_FragCoord.y */
 983    if (!flip && ir->data.pixel_center_integer) {
 984       emit(MOV(wpos, this->pixel_y));
 985    } else {
 986       fs_reg pixel_y = this->pixel_y;
 987       float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
 988
 989       if (flip) {
 990          pixel_y.negate = true;
 991          offset += c->key.drawable_height - 1.0;
 992       }
 993
 994       emit(ADD(wpos, pixel_y, fs_reg(offset)));
 995    }
 996    wpos.reg_offset++;
 997
 998    /* gl_FragCoord.z */
 999    if (brw->gen >= 6) {
1000       emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1001    } else {
1002       emit(FS_OPCODE_LINTERP, wpos,
1003            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1004            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1005            interp_reg(VARYING_SLOT_POS, 2));
1006    }
1007    wpos.reg_offset++;
1008
1009    /* gl_FragCoord.w: Already set up in emit_interpolation */
1010    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1011
1012    return reg;
1013 }
1014
1015 fs_inst *
1016 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1017                          glsl_interp_qualifier interpolation_mode,
1018                          bool is_centroid, bool is_sample)
1019 {
1020    brw_wm_barycentric_interp_mode barycoord_mode;
1021    if (brw->gen >= 6) {
1022       if (is_centroid) {
1023          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1024             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1025          else
1026             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1027       } else if (is_sample) {
1028           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1029             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1030          else
1031             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1032       } else {
1033          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1034             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1035          else
1036             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1037       }
1038    } else {
1039       /* On Ironlake and below, there is only one interpolation mode.
1040        * Centroid interpolation doesn't mean anything on this hardware --
1041        * there is no multisampling.
1042        */
1043       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1044    }
1045    return emit(FS_OPCODE_LINTERP, attr,
1046                this->delta_x[barycoord_mode],
1047                this->delta_y[barycoord_mode], interp);
1048 }
1049
1050 fs_reg *
1051 fs_visitor::emit_general_interpolation(ir_variable *ir)
1052 {
1053    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1054    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1055    fs_reg attr = *reg;
1056
1057    unsigned int array_elements;
1058    const glsl_type *type;
1059
1060    if (ir->type->is_array()) {
1061       array_elements = ir->type->length;
1062       if (array_elements == 0) {
1063          fail("dereferenced array '%s' has length 0\n", ir->name);
1064       }
1065       type = ir->type->fields.array;
1066    } else {
1067       array_elements = 1;
1068       type = ir->type;
1069    }
1070
1071    glsl_interp_qualifier interpolation_mode =
1072       ir->determine_interpolation_mode(c->key.flat_shade);
1073
1074    int location = ir->data.location;
1075    for (unsigned int i = 0; i < array_elements; i++) {
1076       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1077          if (c->prog_data.urb_setup[location] == -1) {
1078             /* If there's no incoming setup data for this slot, don't
1079              * emit interpolation for it.
1080              */
1081             attr.reg_offset += type->vector_elements;
1082             location++;
1083             continue;
1084          }
1085
1086          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1087             /* Constant interpolation (flat shading) case. The SF has
1088              * handed us defined values in only the constant offset
1089              * field of the setup reg.
1090              */
1091             for (unsigned int k = 0; k < type->vector_elements; k++) {
1092                struct brw_reg interp = interp_reg(location, k);
1093                interp = suboffset(interp, 3);
1094                interp.type = reg->type;
1095                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1096                attr.reg_offset++;
1097             }
1098          } else {
1099             /* Smooth/noperspective interpolation case. */
1100             for (unsigned int k = 0; k < type->vector_elements; k++) {
1101                struct brw_reg interp = interp_reg(location, k);
1102                emit_linterp(attr, fs_reg(interp), interpolation_mode,
1103                             ir->data.centroid && !c->key.persample_shading,
1104                             ir->data.sample || c->key.persample_shading);
1105                if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1106                   /* Get the pixel/sample mask into f0 so that we know
1107                    * which pixels are lit.  Then, for each channel that is
1108                    * unlit, replace the centroid data with non-centroid
1109                    * data.
1110                    */
1111                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1112                   fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1113                                                interpolation_mode,
1114                                                false, false);
1115                   inst->predicate = BRW_PREDICATE_NORMAL;
1116                   inst->predicate_inverse = true;
1117                }
1118                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1119                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1120                }
1121                attr.reg_offset++;
1122             }
1123
1124          }
1125          location++;
1126       }
1127    }
1128
1129    return reg;
1130 }
1131
1132 fs_reg *
1133 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1134 {
1135    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1136
1137    /* The frontfacing comes in as a bit in the thread payload. */
1138    if (brw->gen >= 6) {
1139       emit(BRW_OPCODE_ASR, *reg,
1140            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1141            fs_reg(15));
1142       emit(BRW_OPCODE_NOT, *reg, *reg);
1143       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1144    } else {
1145       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1146       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1147        * us front face
1148        */
1149       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1150       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1151    }
1152
1153    return reg;
1154 }
1155
1156 void
1157 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1158 {
1159    assert(dst.type == BRW_REGISTER_TYPE_F);
1160
1161    if (c->key.compute_pos_offset) {
1162       /* Convert int_sample_pos to floating point */
1163       emit(MOV(dst, int_sample_pos));
1164       /* Scale to the range [0, 1] */
1165       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1166    }
1167    else {
1168       /* From ARB_sample_shading specification:
1169        * "When rendering to a non-multisample buffer, or if multisample
1170        *  rasterization is disabled, gl_SamplePosition will always be
1171        *  (0.5, 0.5).
1172        */
1173       emit(MOV(dst, fs_reg(0.5f)));
1174    }
1175 }
1176
1177 fs_reg *
1178 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1179 {
1180    assert(brw->gen >= 6);
1181    assert(ir->type == glsl_type::vec2_type);
1182
1183    this->current_annotation = "compute sample position";
1184    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1185    fs_reg pos = *reg;
1186    fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1187    fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1188
1189    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1190     * mode will be enabled.
1191     *
1192     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1193     * R31.1:0         Position Offset X/Y for Slot[3:0]
1194     * R31.3:2         Position Offset X/Y for Slot[7:4]
1195     * .....
1196     *
1197     * The X, Y sample positions come in as bytes in  thread payload. So, read
1198     * the positions using vstride=16, width=8, hstride=2.
1199     */
1200    struct brw_reg sample_pos_reg =
1201       stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1202                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1203
1204    emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1205    if (dispatch_width == 16) {
1206       fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1207                                fs_reg(suboffset(sample_pos_reg, 16))));
1208       inst->force_sechalf = true;
1209    }
1210    /* Compute gl_SamplePosition.x */
1211    compute_sample_position(pos, int_sample_x);
1212    pos.reg_offset++;
1213    emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1214    if (dispatch_width == 16) {
1215       fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1216                                fs_reg(suboffset(sample_pos_reg, 17))));
1217       inst->force_sechalf = true;
1218    }
1219    /* Compute gl_SamplePosition.y */
1220    compute_sample_position(pos, int_sample_y);
1221    return reg;
1222 }
1223
1224 fs_reg *
1225 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1226 {
1227    assert(brw->gen >= 6);
1228
1229    this->current_annotation = "compute sample id";
1230    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1231
1232    if (c->key.compute_sample_id) {
1233       fs_reg t1 = fs_reg(this, glsl_type::int_type);
1234       fs_reg t2 = fs_reg(this, glsl_type::int_type);
1235       t2.type = BRW_REGISTER_TYPE_UW;
1236
1237       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1238        * 8x multisampling, subspan 0 will represent sample N (where N
1239        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1240        * 7. We can find the value of N by looking at R0.0 bits 7:6
1241        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1242        * (since samples are always delivered in pairs). That is, we
1243        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1244        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1245        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1246        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1247        * populating a temporary variable with the sequence (0, 1, 2, 3),
1248        * and then reading from it using vstride=1, width=4, hstride=0.
1249        * These computations hold good for 4x multisampling as well.
1250        */
1251       emit(BRW_OPCODE_AND, t1,
1252            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1253            fs_reg(brw_imm_d(0xc0)));
1254       emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1255       /* This works for both SIMD8 and SIMD16 */
1256       emit(MOV(t2, brw_imm_v(0x3210)));
1257       /* This special instruction takes care of setting vstride=1,
1258        * width=4, hstride=0 of t2 during an ADD instruction.
1259        */
1260       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1261    } else {
1262       /* As per GL_ARB_sample_shading specification:
1263        * "When rendering to a non-multisample buffer, or if multisample
1264        *  rasterization is disabled, gl_SampleID will always be zero."
1265        */
1266       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1267    }
1268
1269    return reg;
1270 }
1271
1272 fs_reg *
1273 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1274 {
1275    assert(brw->gen >= 7);
1276    this->current_annotation = "compute gl_SampleMaskIn";
1277    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1278    emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1279    return reg;
1280 }
1281
1282 fs_reg
1283 fs_visitor::fix_math_operand(fs_reg src)
1284 {
1285    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1286     * might be able to do better by doing execsize = 1 math and then
1287     * expanding that result out, but we would need to be careful with
1288     * masking.
1289     *
1290     * The hardware ignores source modifiers (negate and abs) on math
1291     * instructions, so we also move to a temp to set those up.
1292     */
1293    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1294        !src.abs && !src.negate)
1295       return src;
1296
1297    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1298     * operands to math
1299     */
1300    if (brw->gen >= 7 && src.file != IMM)
1301       return src;
1302
1303    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1304    expanded.type = src.type;
1305    emit(BRW_OPCODE_MOV, expanded, src);
1306    return expanded;
1307 }
1308
1309 fs_inst *
1310 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1311 {
1312    switch (opcode) {
1313    case SHADER_OPCODE_RCP:
1314    case SHADER_OPCODE_RSQ:
1315    case SHADER_OPCODE_SQRT:
1316    case SHADER_OPCODE_EXP2:
1317    case SHADER_OPCODE_LOG2:
1318    case SHADER_OPCODE_SIN:
1319    case SHADER_OPCODE_COS:
1320       break;
1321    default:
1322       assert(!"not reached: bad math opcode");
1323       return NULL;
1324    }
1325
1326    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1327     * might be able to do better by doing execsize = 1 math and then
1328     * expanding that result out, but we would need to be careful with
1329     * masking.
1330     *
1331     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1332     * instructions, so we also move to a temp to set those up.
1333     */
1334    if (brw->gen >= 6)
1335       src = fix_math_operand(src);
1336
1337    fs_inst *inst = emit(opcode, dst, src);
1338
1339    if (brw->gen < 6) {
1340       inst->base_mrf = 2;
1341       inst->mlen = dispatch_width / 8;
1342    }
1343
1344    return inst;
1345 }
1346
1347 fs_inst *
1348 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1349 {
1350    int base_mrf = 2;
1351    fs_inst *inst;
1352
1353    switch (opcode) {
1354    case SHADER_OPCODE_INT_QUOTIENT:
1355    case SHADER_OPCODE_INT_REMAINDER:
1356       if (brw->gen >= 7 && dispatch_width == 16)
1357          fail("SIMD16 INTDIV unsupported\n");
1358       break;
1359    case SHADER_OPCODE_POW:
1360       break;
1361    default:
1362       assert(!"not reached: unsupported binary math opcode.");
1363       return NULL;
1364    }
1365
1366    if (brw->gen >= 6) {
1367       src0 = fix_math_operand(src0);
1368       src1 = fix_math_operand(src1);
1369
1370       inst = emit(opcode, dst, src0, src1);
1371    } else {
1372       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1373        * "Message Payload":
1374        *
1375        * "Operand0[7].  For the INT DIV functions, this operand is the
1376        *  denominator."
1377        *  ...
1378        * "Operand1[7].  For the INT DIV functions, this operand is the
1379        *  numerator."
1380        */
1381       bool is_int_div = opcode != SHADER_OPCODE_POW;
1382       fs_reg &op0 = is_int_div ? src1 : src0;
1383       fs_reg &op1 = is_int_div ? src0 : src1;
1384
1385       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1386       inst = emit(opcode, dst, op0, reg_null_f);
1387
1388       inst->base_mrf = base_mrf;
1389       inst->mlen = 2 * dispatch_width / 8;
1390    }
1391    return inst;
1392 }
1393
1394 void
1395 fs_visitor::assign_curb_setup()
1396 {
1397    if (dispatch_width == 8) {
1398       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1399       stage_prog_data->nr_params = uniforms;
1400    } else {
1401       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1402       /* Make sure we didn't try to sneak in an extra uniform */
1403       assert(uniforms == 0);
1404    }
1405
1406    c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1407
1408    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1409    foreach_list(node, &this->instructions) {
1410       fs_inst *inst = (fs_inst *)node;
1411
1412       for (unsigned int i = 0; i < 3; i++) {
1413          if (inst->src[i].file == UNIFORM) {
1414             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1415             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1416                                                   constant_nr / 8,
1417                                                   constant_nr % 8);
1418
1419             inst->src[i].file = HW_REG;
1420             inst->src[i].fixed_hw_reg = byte_offset(
1421                retype(brw_reg, inst->src[i].type),
1422                inst->src[i].subreg_offset);
1423          }
1424       }
1425    }
1426 }
1427
1428 void
1429 fs_visitor::calculate_urb_setup()
1430 {
1431    for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1432       c->prog_data.urb_setup[i] = -1;
1433    }
1434
1435    int urb_next = 0;
1436    /* Figure out where each of the incoming setup attributes lands. */
1437    if (brw->gen >= 6) {
1438       if (_mesa_bitcount_64(fp->Base.InputsRead &
1439                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1440          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1441           * first 16 varying inputs, so we can put them wherever we want.
1442           * Just put them in order.
1443           *
1444           * This is useful because it means that (a) inputs not used by the
1445           * fragment shader won't take up valuable register space, and (b) we
1446           * won't have to recompile the fragment shader if it gets paired with
1447           * a different vertex (or geometry) shader.
1448           */
1449          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1450             if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1451                 BITFIELD64_BIT(i)) {
1452                c->prog_data.urb_setup[i] = urb_next++;
1453             }
1454          }
1455       } else {
1456          /* We have enough input varyings that the SF/SBE pipeline stage can't
1457           * arbitrarily rearrange them to suit our whim; we have to put them
1458           * in an order that matches the output of the previous pipeline stage
1459           * (geometry or vertex shader).
1460           */
1461          struct brw_vue_map prev_stage_vue_map;
1462          brw_compute_vue_map(brw, &prev_stage_vue_map,
1463                              c->key.input_slots_valid);
1464          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1465          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1466          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1467               slot++) {
1468             int varying = prev_stage_vue_map.slot_to_varying[slot];
1469             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1470              * unused.
1471              */
1472             if (varying != BRW_VARYING_SLOT_COUNT &&
1473                 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1474                  BITFIELD64_BIT(varying))) {
1475                c->prog_data.urb_setup[varying] = slot - first_slot;
1476             }
1477          }
1478          urb_next = prev_stage_vue_map.num_slots - first_slot;
1479       }
1480    } else {
1481       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1482       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1483          /* Point size is packed into the header, not as a general attribute */
1484          if (i == VARYING_SLOT_PSIZ)
1485             continue;
1486
1487          if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1488             /* The back color slot is skipped when the front color is
1489              * also written to.  In addition, some slots can be
1490              * written in the vertex shader and not read in the
1491              * fragment shader.  So the register number must always be
1492              * incremented, mapped or not.
1493              */
1494             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1495                c->prog_data.urb_setup[i] = urb_next;
1496             urb_next++;
1497          }
1498       }
1499
1500       /*
1501        * It's a FS only attribute, and we did interpolation for this attribute
1502        * in SF thread. So, count it here, too.
1503        *
1504        * See compile_sf_prog() for more info.
1505        */
1506       if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1507          c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1508    }
1509
1510    c->prog_data.num_varying_inputs = urb_next;
1511 }
1512
1513 void
1514 fs_visitor::assign_urb_setup()
1515 {
1516    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1517
1518    /* Offset all the urb_setup[] index by the actual position of the
1519     * setup regs, now that the location of the constants has been chosen.
1520     */
1521    foreach_list(node, &this->instructions) {
1522       fs_inst *inst = (fs_inst *)node;
1523
1524       if (inst->opcode == FS_OPCODE_LINTERP) {
1525          assert(inst->src[2].file == HW_REG);
1526          inst->src[2].fixed_hw_reg.nr += urb_start;
1527       }
1528
1529       if (inst->opcode == FS_OPCODE_CINTERP) {
1530          assert(inst->src[0].file == HW_REG);
1531          inst->src[0].fixed_hw_reg.nr += urb_start;
1532       }
1533    }
1534
1535    /* Each attribute is 4 setup channels, each of which is half a reg. */
1536    this->first_non_payload_grf =
1537       urb_start + c->prog_data.num_varying_inputs * 2;
1538 }
1539
1540 /**
1541  * Split large virtual GRFs into separate components if we can.
1542  *
1543  * This is mostly duplicated with what brw_fs_vector_splitting does,
1544  * but that's really conservative because it's afraid of doing
1545  * splitting that doesn't result in real progress after the rest of
1546  * the optimization phases, which would cause infinite looping in
1547  * optimization.  We can do it once here, safely.  This also has the
1548  * opportunity to split interpolated values, or maybe even uniforms,
1549  * which we don't have at the IR level.
1550  *
1551  * We want to split, because virtual GRFs are what we register
1552  * allocate and spill (due to contiguousness requirements for some
1553  * instructions), and they're what we naturally generate in the
1554  * codegen process, but most virtual GRFs don't actually need to be
1555  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1556  * live intervals and better dead code elimination and coalescing.
1557  */
1558 void
1559 fs_visitor::split_virtual_grfs()
1560 {
1561    int num_vars = this->virtual_grf_count;
1562    bool split_grf[num_vars];
1563    int new_virtual_grf[num_vars];
1564
1565    /* Try to split anything > 0 sized. */
1566    for (int i = 0; i < num_vars; i++) {
1567       if (this->virtual_grf_sizes[i] != 1)
1568          split_grf[i] = true;
1569       else
1570          split_grf[i] = false;
1571    }
1572
1573    if (brw->has_pln &&
1574        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1575       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1576        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1577        * Gen6, that was the only supported interpolation mode, and since Gen6,
1578        * delta_x and delta_y are in fixed hardware registers.
1579        */
1580       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1581          false;
1582    }
1583
1584    foreach_list(node, &this->instructions) {
1585       fs_inst *inst = (fs_inst *)node;
1586
1587       /* If there's a SEND message that requires contiguous destination
1588        * registers, no splitting is allowed.
1589        */
1590       if (inst->regs_written > 1) {
1591          split_grf[inst->dst.reg] = false;
1592       }
1593
1594       /* If we're sending from a GRF, don't split it, on the assumption that
1595        * the send is reading the whole thing.
1596        */
1597       if (inst->is_send_from_grf()) {
1598          for (int i = 0; i < 3; i++) {
1599             if (inst->src[i].file == GRF) {
1600                split_grf[inst->src[i].reg] = false;
1601             }
1602          }
1603       }
1604    }
1605
1606    /* Allocate new space for split regs.  Note that the virtual
1607     * numbers will be contiguous.
1608     */
1609    for (int i = 0; i < num_vars; i++) {
1610       if (split_grf[i]) {
1611          new_virtual_grf[i] = virtual_grf_alloc(1);
1612          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1613             int reg = virtual_grf_alloc(1);
1614             assert(reg == new_virtual_grf[i] + j - 1);
1615             (void) reg;
1616          }
1617          this->virtual_grf_sizes[i] = 1;
1618       }
1619    }
1620
1621    foreach_list(node, &this->instructions) {
1622       fs_inst *inst = (fs_inst *)node;
1623
1624       if (inst->dst.file == GRF &&
1625           split_grf[inst->dst.reg] &&
1626           inst->dst.reg_offset != 0) {
1627          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1628                           inst->dst.reg_offset - 1);
1629          inst->dst.reg_offset = 0;
1630       }
1631       for (int i = 0; i < 3; i++) {
1632          if (inst->src[i].file == GRF &&
1633              split_grf[inst->src[i].reg] &&
1634              inst->src[i].reg_offset != 0) {
1635             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1636                                 inst->src[i].reg_offset - 1);
1637             inst->src[i].reg_offset = 0;
1638          }
1639       }
1640    }
1641    invalidate_live_intervals();
1642 }
1643
1644 /**
1645  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1646  *
1647  * During code generation, we create tons of temporary variables, many of
1648  * which get immediately killed and are never used again.  Yet, in later
1649  * optimization and analysis passes, such as compute_live_intervals, we need
1650  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1651  * overhead.
1652  */
1653 void
1654 fs_visitor::compact_virtual_grfs()
1655 {
1656    /* Mark which virtual GRFs are used, and count how many. */
1657    int remap_table[this->virtual_grf_count];
1658    memset(remap_table, -1, sizeof(remap_table));
1659
1660    foreach_list(node, &this->instructions) {
1661       const fs_inst *inst = (const fs_inst *) node;
1662
1663       if (inst->dst.file == GRF)
1664          remap_table[inst->dst.reg] = 0;
1665
1666       for (int i = 0; i < 3; i++) {
1667          if (inst->src[i].file == GRF)
1668             remap_table[inst->src[i].reg] = 0;
1669       }
1670    }
1671
1672    /* In addition to registers used in instructions, fs_visitor keeps
1673     * direct references to certain special values which must be patched:
1674     */
1675    fs_reg *special[] = {
1676       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1677       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1678       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1679       &delta_x[0], &delta_x[1], &delta_x[2],
1680       &delta_x[3], &delta_x[4], &delta_x[5],
1681       &delta_y[0], &delta_y[1], &delta_y[2],
1682       &delta_y[3], &delta_y[4], &delta_y[5],
1683    };
1684    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1685    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1686
1687    /* Treat all special values as used, to be conservative */
1688    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1689       if (special[i]->file == GRF)
1690          remap_table[special[i]->reg] = 0;
1691    }
1692
1693    /* Compact the GRF arrays. */
1694    int new_index = 0;
1695    for (int i = 0; i < this->virtual_grf_count; i++) {
1696       if (remap_table[i] != -1) {
1697          remap_table[i] = new_index;
1698          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1699          invalidate_live_intervals();
1700          ++new_index;
1701       }
1702    }
1703
1704    this->virtual_grf_count = new_index;
1705
1706    /* Patch all the instructions to use the newly renumbered registers */
1707    foreach_list(node, &this->instructions) {
1708       fs_inst *inst = (fs_inst *) node;
1709
1710       if (inst->dst.file == GRF)
1711          inst->dst.reg = remap_table[inst->dst.reg];
1712
1713       for (int i = 0; i < 3; i++) {
1714          if (inst->src[i].file == GRF)
1715             inst->src[i].reg = remap_table[inst->src[i].reg];
1716       }
1717    }
1718
1719    /* Patch all the references to special values */
1720    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1721       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1722          special[i]->reg = remap_table[special[i]->reg];
1723    }
1724 }
1725
1726 bool
1727 fs_visitor::remove_dead_constants()
1728 {
1729    if (dispatch_width == 8) {
1730       this->params_remap = ralloc_array(mem_ctx, int, uniforms);
1731       this->nr_params_remap = uniforms;
1732
1733       for (unsigned int i = 0; i < uniforms; i++)
1734          this->params_remap[i] = -1;
1735
1736       /* Find which params are still in use. */
1737       foreach_list(node, &this->instructions) {
1738          fs_inst *inst = (fs_inst *)node;
1739
1740          for (int i = 0; i < 3; i++) {
1741             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1742
1743             if (inst->src[i].file != UNIFORM)
1744                continue;
1745
1746             /* Section 5.11 of the OpenGL 4.3 spec says:
1747              *
1748              *     "Out-of-bounds reads return undefined values, which include
1749              *     values from other variables of the active program or zero."
1750              */
1751             if (constant_nr < 0 || constant_nr >= (int)uniforms) {
1752                constant_nr = 0;
1753             }
1754
1755             /* For now, set this to non-negative.  We'll give it the
1756              * actual new number in a moment, in order to keep the
1757              * register numbers nicely ordered.
1758              */
1759             this->params_remap[constant_nr] = 0;
1760          }
1761       }
1762
1763       /* Figure out what the new numbers for the params will be.  At some
1764        * point when we're doing uniform array access, we're going to want
1765        * to keep the distinction between .reg and .reg_offset, but for
1766        * now we don't care.
1767        */
1768       unsigned int new_nr_params = 0;
1769       for (unsigned int i = 0; i < uniforms; i++) {
1770          if (this->params_remap[i] != -1) {
1771             this->params_remap[i] = new_nr_params++;
1772          }
1773       }
1774
1775       /* Update the list of params to be uploaded to match our new numbering. */
1776       for (unsigned int i = 0; i < uniforms; i++) {
1777          int remapped = this->params_remap[i];
1778
1779          if (remapped == -1)
1780             continue;
1781
1782          stage_prog_data->param[remapped] = stage_prog_data->param[i];
1783       }
1784
1785       uniforms = new_nr_params;
1786    } else {
1787       /* This should have been generated in the SIMD8 pass already. */
1788       assert(this->params_remap);
1789    }
1790
1791    /* Now do the renumbering of the shader to remove unused params. */
1792    foreach_list(node, &this->instructions) {
1793       fs_inst *inst = (fs_inst *)node;
1794
1795       for (int i = 0; i < 3; i++) {
1796          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1797
1798          if (inst->src[i].file != UNIFORM)
1799             continue;
1800
1801          /* as above alias to 0 */
1802          if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1803             constant_nr = 0;
1804          }
1805          assert(this->params_remap[constant_nr] != -1);
1806          inst->src[i].reg = this->params_remap[constant_nr];
1807          inst->src[i].reg_offset = 0;
1808       }
1809    }
1810
1811    return true;
1812 }
1813
1814 /*
1815  * Implements array access of uniforms by inserting a
1816  * PULL_CONSTANT_LOAD instruction.
1817  *
1818  * Unlike temporary GRF array access (where we don't support it due to
1819  * the difficulty of doing relative addressing on instruction
1820  * destinations), we could potentially do array access of uniforms
1821  * that were loaded in GRF space as push constants.  In real-world
1822  * usage we've seen, though, the arrays being used are always larger
1823  * than we could load as push constants, so just always move all
1824  * uniform array access out to a pull constant buffer.
1825  */
1826 void
1827 fs_visitor::move_uniform_array_access_to_pull_constants()
1828 {
1829    int pull_constant_loc[uniforms];
1830
1831    for (unsigned int i = 0; i < uniforms; i++) {
1832       pull_constant_loc[i] = -1;
1833    }
1834
1835    /* Walk through and find array access of uniforms.  Put a copy of that
1836     * uniform in the pull constant buffer.
1837     *
1838     * Note that we don't move constant-indexed accesses to arrays.  No
1839     * testing has been done of the performance impact of this choice.
1840     */
1841    foreach_list_safe(node, &this->instructions) {
1842       fs_inst *inst = (fs_inst *)node;
1843
1844       for (int i = 0 ; i < 3; i++) {
1845          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1846             continue;
1847
1848          int uniform = inst->src[i].reg;
1849
1850          /* If this array isn't already present in the pull constant buffer,
1851           * add it.
1852           */
1853          if (pull_constant_loc[uniform] == -1) {
1854             const float **values = &stage_prog_data->param[uniform];
1855
1856             pull_constant_loc[uniform] = stage_prog_data->nr_pull_params;
1857
1858             assert(param_size[uniform]);
1859
1860             for (int j = 0; j < param_size[uniform]; j++) {
1861                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1862                   values[j];
1863             }
1864          }
1865
1866          /* Set up the annotation tracking for new generated instructions. */
1867          base_ir = inst->ir;
1868          current_annotation = inst->annotation;
1869
1870          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1871          fs_reg temp = fs_reg(this, glsl_type::float_type);
1872          exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1873                                                      surf_index,
1874                                                      *inst->src[i].reladdr,
1875                                                      pull_constant_loc[uniform] +
1876                                                      inst->src[i].reg_offset);
1877          inst->insert_before(&list);
1878
1879          inst->src[i].file = temp.file;
1880          inst->src[i].reg = temp.reg;
1881          inst->src[i].reg_offset = temp.reg_offset;
1882          inst->src[i].reladdr = NULL;
1883       }
1884    }
1885 }
1886
1887 /**
1888  * Choose accesses from the UNIFORM file to demote to using the pull
1889  * constant buffer.
1890  *
1891  * We allow a fragment shader to have more than the specified minimum
1892  * maximum number of fragment shader uniform components (64).  If
1893  * there are too many of these, they'd fill up all of register space.
1894  * So, this will push some of them out to the pull constant buffer and
1895  * update the program to load them.
1896  */
1897 void
1898 fs_visitor::setup_pull_constants()
1899 {
1900    /* Only allow 16 registers (128 uniform components) as push constants. */
1901    unsigned int max_uniform_components = 16 * 8;
1902    if (uniforms <= max_uniform_components)
1903       return;
1904
1905    if (dispatch_width == 16) {
1906       fail("Pull constants not supported in SIMD16\n");
1907       return;
1908    }
1909
1910    /* Just demote the end of the list.  We could probably do better
1911     * here, demoting things that are rarely used in the program first.
1912     */
1913    unsigned int pull_uniform_base = max_uniform_components;
1914
1915    int pull_constant_loc[uniforms];
1916    for (unsigned int i = 0; i < uniforms; i++) {
1917       if (i < pull_uniform_base) {
1918          pull_constant_loc[i] = -1;
1919       } else {
1920          pull_constant_loc[i] = -1;
1921          /* If our constant is already being uploaded for reladdr purposes,
1922           * reuse it.
1923           */
1924          for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j++) {
1925             if (stage_prog_data->pull_param[j] == stage_prog_data->param[i]) {
1926                pull_constant_loc[i] = j;
1927                break;
1928             }
1929          }
1930          if (pull_constant_loc[i] == -1) {
1931             int pull_index = stage_prog_data->nr_pull_params++;
1932             stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1933             pull_constant_loc[i] = pull_index;
1934          }
1935       }
1936    }
1937    uniforms = pull_uniform_base;
1938
1939    foreach_list(node, &this->instructions) {
1940       fs_inst *inst = (fs_inst *)node;
1941
1942       for (int i = 0; i < 3; i++) {
1943          if (inst->src[i].file != UNIFORM)
1944             continue;
1945
1946          int pull_index = pull_constant_loc[inst->src[i].reg +
1947                                             inst->src[i].reg_offset];
1948          if (pull_index == -1)
1949             continue;
1950
1951          assert(!inst->src[i].reladdr);
1952
1953          fs_reg dst = fs_reg(this, glsl_type::float_type);
1954          fs_reg index(stage_prog_data->binding_table.pull_constants_start);
1955          fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1956          fs_inst *pull =
1957             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1958                                  dst, index, offset);
1959          pull->ir = inst->ir;
1960          pull->annotation = inst->annotation;
1961
1962          inst->insert_before(pull);
1963
1964          inst->src[i].file = GRF;
1965          inst->src[i].reg = dst.reg;
1966          inst->src[i].reg_offset = 0;
1967          inst->src[i].set_smear(pull_index & 3);
1968       }
1969    }
1970 }
1971
1972 bool
1973 fs_visitor::opt_algebraic()
1974 {
1975    bool progress = false;
1976
1977    foreach_list(node, &this->instructions) {
1978       fs_inst *inst = (fs_inst *)node;
1979
1980       switch (inst->opcode) {
1981       case BRW_OPCODE_MUL:
1982          if (inst->src[1].file != IMM)
1983             continue;
1984
1985          /* a * 1.0 = a */
1986          if (inst->src[1].is_one()) {
1987             inst->opcode = BRW_OPCODE_MOV;
1988             inst->src[1] = reg_undef;
1989             progress = true;
1990             break;
1991          }
1992
1993          /* a * 0.0 = 0.0 */
1994          if (inst->src[1].is_zero()) {
1995             inst->opcode = BRW_OPCODE_MOV;
1996             inst->src[0] = inst->src[1];
1997             inst->src[1] = reg_undef;
1998             progress = true;
1999             break;
2000          }
2001
2002          break;
2003       case BRW_OPCODE_ADD:
2004          if (inst->src[1].file != IMM)
2005             continue;
2006
2007          /* a + 0.0 = a */
2008          if (inst->src[1].is_zero()) {
2009             inst->opcode = BRW_OPCODE_MOV;
2010             inst->src[1] = reg_undef;
2011             progress = true;
2012             break;
2013          }
2014          break;
2015       case BRW_OPCODE_OR:
2016          if (inst->src[0].equals(inst->src[1])) {
2017             inst->opcode = BRW_OPCODE_MOV;
2018             inst->src[1] = reg_undef;
2019             progress = true;
2020             break;
2021          }
2022          break;
2023       case BRW_OPCODE_LRP:
2024          if (inst->src[1].equals(inst->src[2])) {
2025             inst->opcode = BRW_OPCODE_MOV;
2026             inst->src[0] = inst->src[1];
2027             inst->src[1] = reg_undef;
2028             inst->src[2] = reg_undef;
2029             progress = true;
2030             break;
2031          }
2032          break;
2033       case BRW_OPCODE_SEL:
2034          if (inst->saturate && inst->src[1].file == IMM) {
2035             switch (inst->conditional_mod) {
2036             case BRW_CONDITIONAL_LE:
2037             case BRW_CONDITIONAL_L:
2038                switch (inst->src[1].type) {
2039                case BRW_REGISTER_TYPE_F:
2040                   if (inst->src[1].imm.f >= 1.0f) {
2041                      inst->opcode = BRW_OPCODE_MOV;
2042                      inst->src[1] = reg_undef;
2043                      progress = true;
2044                   }
2045                   break;
2046                default:
2047                   break;
2048                }
2049                break;
2050             case BRW_CONDITIONAL_GE:
2051             case BRW_CONDITIONAL_G:
2052                switch (inst->src[1].type) {
2053                case BRW_REGISTER_TYPE_F:
2054                   if (inst->src[1].imm.f <= 0.0f) {
2055                      inst->opcode = BRW_OPCODE_MOV;
2056                      inst->src[1] = reg_undef;
2057                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2058                      progress = true;
2059                   }
2060                   break;
2061                default:
2062                   break;
2063                }
2064             default:
2065                break;
2066             }
2067          }
2068          break;
2069       default:
2070          break;
2071       }
2072    }
2073
2074    return progress;
2075 }
2076
2077 /**
2078  * Removes any instructions writing a VGRF where that VGRF is not used by any
2079  * later instruction.
2080  */
2081 bool
2082 fs_visitor::dead_code_eliminate()
2083 {
2084    bool progress = false;
2085    int pc = 0;
2086
2087    calculate_live_intervals();
2088
2089    foreach_list_safe(node, &this->instructions) {
2090       fs_inst *inst = (fs_inst *)node;
2091
2092       if (inst->dst.file == GRF && !inst->has_side_effects()) {
2093          bool dead = true;
2094
2095          for (int i = 0; i < inst->regs_written; i++) {
2096             int var = live_intervals->var_from_vgrf[inst->dst.reg];
2097             assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2098             if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2099                dead = false;
2100                break;
2101             }
2102          }
2103
2104          if (dead) {
2105             /* Don't dead code eliminate instructions that write to the
2106              * accumulator as a side-effect. Instead just set the destination
2107              * to the null register to free it.
2108              */
2109             switch (inst->opcode) {
2110             case BRW_OPCODE_ADDC:
2111             case BRW_OPCODE_SUBB:
2112             case BRW_OPCODE_MACH:
2113                inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2114                break;
2115             default:
2116                inst->remove();
2117                progress = true;
2118                break;
2119             }
2120          }
2121       }
2122
2123       pc++;
2124    }
2125
2126    if (progress)
2127       invalidate_live_intervals();
2128
2129    return progress;
2130 }
2131
2132 struct dead_code_hash_key
2133 {
2134    int vgrf;
2135    int reg_offset;
2136 };
2137
2138 static bool
2139 dead_code_hash_compare(const void *a, const void *b)
2140 {
2141    return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2142 }
2143
2144 static void
2145 clear_dead_code_hash(struct hash_table *ht)
2146 {
2147    struct hash_entry *entry;
2148
2149    hash_table_foreach(ht, entry) {
2150       _mesa_hash_table_remove(ht, entry);
2151    }
2152 }
2153
2154 static void
2155 insert_dead_code_hash(struct hash_table *ht,
2156                       int vgrf, int reg_offset, fs_inst *inst)
2157 {
2158    /* We don't bother freeing keys, because they'll be GCed with the ht. */
2159    struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2160
2161    key->vgrf = vgrf;
2162    key->reg_offset = reg_offset;
2163
2164    _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2165 }
2166
2167 static struct hash_entry *
2168 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2169 {
2170    struct dead_code_hash_key key;
2171
2172    key.vgrf = vgrf;
2173    key.reg_offset = reg_offset;
2174
2175    return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2176 }
2177
2178 static void
2179 remove_dead_code_hash(struct hash_table *ht,
2180                       int vgrf, int reg_offset)
2181 {
2182    struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2183    if (!entry)
2184       return;
2185
2186    _mesa_hash_table_remove(ht, entry);
2187 }
2188
2189 /**
2190  * Walks basic blocks, removing any regs that are written but not read before
2191  * being redefined.
2192  *
2193  * The dead_code_eliminate() function implements a global dead code
2194  * elimination, but it only handles the removing the last write to a register
2195  * if it's never read.  This one can handle intermediate writes, but only
2196  * within a basic block.
2197  */
2198 bool
2199 fs_visitor::dead_code_eliminate_local()
2200 {
2201    struct hash_table *ht;
2202    bool progress = false;
2203
2204    ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2205
2206    if (ht == NULL) {
2207       return false;
2208    }
2209
2210    foreach_list_safe(node, &this->instructions) {
2211       fs_inst *inst = (fs_inst *)node;
2212
2213       /* At a basic block, empty the HT since we don't understand dataflow
2214        * here.
2215        */
2216       if (inst->is_control_flow()) {
2217          clear_dead_code_hash(ht);
2218          continue;
2219       }
2220
2221       /* Clear the HT of any instructions that got read. */
2222       for (int i = 0; i < 3; i++) {
2223          fs_reg src = inst->src[i];
2224          if (src.file != GRF)
2225             continue;
2226
2227          int read = 1;
2228          if (inst->is_send_from_grf())
2229             read = virtual_grf_sizes[src.reg] - src.reg_offset;
2230
2231          for (int reg_offset = src.reg_offset;
2232               reg_offset < src.reg_offset + read;
2233               reg_offset++) {
2234             remove_dead_code_hash(ht, src.reg, reg_offset);
2235          }
2236       }
2237
2238       /* Add any update of a GRF to the HT, removing a previous write if it
2239        * wasn't read.
2240        */
2241       if (inst->dst.file == GRF) {
2242          if (inst->regs_written > 1) {
2243             /* We don't know how to trim channels from an instruction's
2244              * writes, so we can't incrementally remove unread channels from
2245              * it.  Just remove whatever it overwrites from the table
2246              */
2247             for (int i = 0; i < inst->regs_written; i++) {
2248                remove_dead_code_hash(ht,
2249                                      inst->dst.reg,
2250                                      inst->dst.reg_offset + i);
2251             }
2252          } else {
2253             struct hash_entry *entry =
2254                get_dead_code_hash_entry(ht, inst->dst.reg,
2255                                         inst->dst.reg_offset);
2256
2257             if (entry) {
2258                if (inst->is_partial_write()) {
2259                   /* For a partial write, we can't remove any previous dead code
2260                    * candidate, since we're just modifying their result.
2261                    */
2262                } else {
2263                   /* We're completely updating a channel, and there was a
2264                    * previous write to the channel that wasn't read.  Kill it!
2265                    */
2266                   fs_inst *inst = (fs_inst *)entry->data;
2267                   inst->remove();
2268                   progress = true;
2269                }
2270
2271                _mesa_hash_table_remove(ht, entry);
2272             }
2273
2274             if (!inst->has_side_effects())
2275                insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2276                                      inst);
2277          }
2278       }
2279    }
2280
2281    _mesa_hash_table_destroy(ht, NULL);
2282
2283    if (progress)
2284       invalidate_live_intervals();
2285
2286    return progress;
2287 }
2288
2289 /**
2290  * Implements register coalescing: Checks if the two registers involved in a
2291  * raw move don't interfere, in which case they can both be stored in the same
2292  * place and the MOV removed.
2293  *
2294  * To do this, all uses of the source of the MOV in the shader are replaced
2295  * with the destination of the MOV. For example:
2296  *
2297  * add vgrf3:F, vgrf1:F, vgrf2:F
2298  * mov vgrf4:F, vgrf3:F
2299  * mul vgrf5:F, vgrf5:F, vgrf4:F
2300  *
2301  * becomes
2302  *
2303  * add vgrf4:F, vgrf1:F, vgrf2:F
2304  * mul vgrf5:F, vgrf5:F, vgrf4:F
2305  */
2306 bool
2307 fs_visitor::register_coalesce()
2308 {
2309    bool progress = false;
2310
2311    calculate_live_intervals();
2312
2313    int src_size = 0;
2314    int channels_remaining = 0;
2315    int reg_from = -1, reg_to = -1;
2316    int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2317    fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2318
2319    foreach_list(node, &this->instructions) {
2320       fs_inst *inst = (fs_inst *)node;
2321
2322       if (inst->opcode != BRW_OPCODE_MOV ||
2323           inst->is_partial_write() ||
2324           inst->saturate ||
2325           inst->src[0].file != GRF ||
2326           inst->src[0].negate ||
2327           inst->src[0].abs ||
2328           !inst->src[0].is_contiguous() ||
2329           inst->dst.file != GRF ||
2330           inst->dst.type != inst->src[0].type) {
2331          continue;
2332       }
2333
2334       if (virtual_grf_sizes[inst->src[0].reg] >
2335           virtual_grf_sizes[inst->dst.reg])
2336          continue;
2337
2338       int var_from = live_intervals->var_from_reg(&inst->src[0]);
2339       int var_to = live_intervals->var_from_reg(&inst->dst);
2340
2341       if (live_intervals->vars_interfere(var_from, var_to) &&
2342           !inst->dst.equals(inst->src[0])) {
2343
2344          /* We know that the live ranges of A (var_from) and B (var_to)
2345           * interfere because of the ->vars_interfere() call above. If the end
2346           * of B's live range is after the end of A's range, then we know two
2347           * things:
2348           *  - the start of B's live range must be in A's live range (since we
2349           *    already know the two ranges interfere, this is the only remaining
2350           *    possibility)
2351           *  - the interference isn't of the form we're looking for (where B is
2352           *    entirely inside A)
2353           */
2354          if (live_intervals->end[var_to] > live_intervals->end[var_from])
2355             continue;
2356
2357          bool overwritten = false;
2358          int scan_ip = -1;
2359
2360          foreach_list(n, &this->instructions) {
2361             fs_inst *scan_inst = (fs_inst *)n;
2362             scan_ip++;
2363
2364             if (scan_inst->is_control_flow()) {
2365                overwritten = true;
2366                break;
2367             }
2368
2369             if (scan_ip <= live_intervals->start[var_to])
2370                continue;
2371
2372             if (scan_ip > live_intervals->end[var_to])
2373                break;
2374
2375             if (scan_inst->dst.equals(inst->dst) ||
2376                 scan_inst->dst.equals(inst->src[0])) {
2377                overwritten = true;
2378                break;
2379             }
2380          }
2381
2382          if (overwritten)
2383             continue;
2384       }
2385
2386       if (reg_from != inst->src[0].reg) {
2387          reg_from = inst->src[0].reg;
2388
2389          src_size = virtual_grf_sizes[inst->src[0].reg];
2390          assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2391
2392          channels_remaining = src_size;
2393          memset(mov, 0, sizeof(mov));
2394
2395          reg_to = inst->dst.reg;
2396       }
2397
2398       if (reg_to != inst->dst.reg)
2399          continue;
2400
2401       const int offset = inst->src[0].reg_offset;
2402       reg_to_offset[offset] = inst->dst.reg_offset;
2403       mov[offset] = inst;
2404       channels_remaining--;
2405
2406       if (channels_remaining)
2407          continue;
2408
2409       bool removed = false;
2410       for (int i = 0; i < src_size; i++) {
2411          if (mov[i]) {
2412             removed = true;
2413
2414             mov[i]->opcode = BRW_OPCODE_NOP;
2415             mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2416             mov[i]->dst = reg_undef;
2417             mov[i]->src[0] = reg_undef;
2418             mov[i]->src[1] = reg_undef;
2419             mov[i]->src[2] = reg_undef;
2420          }
2421       }
2422
2423       foreach_list(node, &this->instructions) {
2424          fs_inst *scan_inst = (fs_inst *)node;
2425
2426          for (int i = 0; i < src_size; i++) {
2427             if (mov[i]) {
2428                if (scan_inst->dst.file == GRF &&
2429                    scan_inst->dst.reg == reg_from &&
2430                    scan_inst->dst.reg_offset == i) {
2431                   scan_inst->dst.reg = reg_to;
2432                   scan_inst->dst.reg_offset = reg_to_offset[i];
2433                }
2434                for (int j = 0; j < 3; j++) {
2435                   if (scan_inst->src[j].file == GRF &&
2436                       scan_inst->src[j].reg == reg_from &&
2437                       scan_inst->src[j].reg_offset == i) {
2438                      scan_inst->src[j].reg = reg_to;
2439                      scan_inst->src[j].reg_offset = reg_to_offset[i];
2440                   }
2441                }
2442             }
2443          }
2444       }
2445
2446       if (removed) {
2447          live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2448                                               live_intervals->start[var_from]);
2449          live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2450                                             live_intervals->end[var_from]);
2451          reg_from = -1;
2452       }
2453    }
2454
2455    foreach_list_safe(node, &this->instructions) {
2456       fs_inst *inst = (fs_inst *)node;
2457
2458       if (inst->opcode == BRW_OPCODE_NOP) {
2459          inst->remove();
2460          progress = true;
2461       }
2462    }
2463
2464    if (progress)
2465       invalidate_live_intervals();
2466
2467    return progress;
2468 }
2469
2470 bool
2471 fs_visitor::compute_to_mrf()
2472 {
2473    bool progress = false;
2474    int next_ip = 0;
2475
2476    calculate_live_intervals();
2477
2478    foreach_list_safe(node, &this->instructions) {
2479       fs_inst *inst = (fs_inst *)node;
2480
2481       int ip = next_ip;
2482       next_ip++;
2483
2484       if (inst->opcode != BRW_OPCODE_MOV ||
2485           inst->is_partial_write() ||
2486           inst->dst.file != MRF || inst->src[0].file != GRF ||
2487           inst->dst.type != inst->src[0].type ||
2488           inst->src[0].abs || inst->src[0].negate ||
2489           !inst->src[0].is_contiguous() ||
2490           inst->src[0].subreg_offset)
2491          continue;
2492
2493       /* Work out which hardware MRF registers are written by this
2494        * instruction.
2495        */
2496       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2497       int mrf_high;
2498       if (inst->dst.reg & BRW_MRF_COMPR4) {
2499          mrf_high = mrf_low + 4;
2500       } else if (dispatch_width == 16 &&
2501                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2502          mrf_high = mrf_low + 1;
2503       } else {
2504          mrf_high = mrf_low;
2505       }
2506
2507       /* Can't compute-to-MRF this GRF if someone else was going to
2508        * read it later.
2509        */
2510       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2511          continue;
2512
2513       /* Found a move of a GRF to a MRF.  Let's see if we can go
2514        * rewrite the thing that made this GRF to write into the MRF.
2515        */
2516       fs_inst *scan_inst;
2517       for (scan_inst = (fs_inst *)inst->prev;
2518            scan_inst->prev != NULL;
2519            scan_inst = (fs_inst *)scan_inst->prev) {
2520          if (scan_inst->dst.file == GRF &&
2521              scan_inst->dst.reg == inst->src[0].reg) {
2522             /* Found the last thing to write our reg we want to turn
2523              * into a compute-to-MRF.
2524              */
2525
2526             /* If this one instruction didn't populate all the
2527              * channels, bail.  We might be able to rewrite everything
2528              * that writes that reg, but it would require smarter
2529              * tracking to delay the rewriting until complete success.
2530              */
2531             if (scan_inst->is_partial_write())
2532                break;
2533
2534             /* Things returning more than one register would need us to
2535              * understand coalescing out more than one MOV at a time.
2536              */
2537             if (scan_inst->regs_written > 1)
2538                break;
2539
2540             /* SEND instructions can't have MRF as a destination. */
2541             if (scan_inst->mlen)
2542                break;
2543
2544             if (brw->gen == 6) {
2545                /* gen6 math instructions must have the destination be
2546                 * GRF, so no compute-to-MRF for them.
2547                 */
2548                if (scan_inst->is_math()) {
2549                   break;
2550                }
2551             }
2552
2553             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2554                /* Found the creator of our MRF's source value. */
2555                scan_inst->dst.file = MRF;
2556                scan_inst->dst.reg = inst->dst.reg;
2557                scan_inst->saturate |= inst->saturate;
2558                inst->remove();
2559                progress = true;
2560             }
2561             break;
2562          }
2563
2564          /* We don't handle control flow here.  Most computation of
2565           * values that end up in MRFs are shortly before the MRF
2566           * write anyway.
2567           */
2568          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2569             break;
2570
2571          /* You can't read from an MRF, so if someone else reads our
2572           * MRF's source GRF that we wanted to rewrite, that stops us.
2573           */
2574          bool interfered = false;
2575          for (int i = 0; i < 3; i++) {
2576             if (scan_inst->src[i].file == GRF &&
2577                 scan_inst->src[i].reg == inst->src[0].reg &&
2578                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2579                interfered = true;
2580             }
2581          }
2582          if (interfered)
2583             break;
2584
2585          if (scan_inst->dst.file == MRF) {
2586             /* If somebody else writes our MRF here, we can't
2587              * compute-to-MRF before that.
2588              */
2589             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2590             int scan_mrf_high;
2591
2592             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2593                scan_mrf_high = scan_mrf_low + 4;
2594             } else if (dispatch_width == 16 &&
2595                        (!scan_inst->force_uncompressed &&
2596                         !scan_inst->force_sechalf)) {
2597                scan_mrf_high = scan_mrf_low + 1;
2598             } else {
2599                scan_mrf_high = scan_mrf_low;
2600             }
2601
2602             if (mrf_low == scan_mrf_low ||
2603                 mrf_low == scan_mrf_high ||
2604                 mrf_high == scan_mrf_low ||
2605                 mrf_high == scan_mrf_high) {
2606                break;
2607             }
2608          }
2609
2610          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2611             /* Found a SEND instruction, which means that there are
2612              * live values in MRFs from base_mrf to base_mrf +
2613              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2614              * above it.
2615              */
2616             if (mrf_low >= scan_inst->base_mrf &&
2617                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2618                break;
2619             }
2620             if (mrf_high >= scan_inst->base_mrf &&
2621                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2622                break;
2623             }
2624          }
2625       }
2626    }
2627
2628    if (progress)
2629       invalidate_live_intervals();
2630
2631    return progress;
2632 }
2633
2634 /**
2635  * Walks through basic blocks, looking for repeated MRF writes and
2636  * removing the later ones.
2637  */
2638 bool
2639 fs_visitor::remove_duplicate_mrf_writes()
2640 {
2641    fs_inst *last_mrf_move[16];
2642    bool progress = false;
2643
2644    /* Need to update the MRF tracking for compressed instructions. */
2645    if (dispatch_width == 16)
2646       return false;
2647
2648    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2649
2650    foreach_list_safe(node, &this->instructions) {
2651       fs_inst *inst = (fs_inst *)node;
2652
2653       if (inst->is_control_flow()) {
2654          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2655       }
2656
2657       if (inst->opcode == BRW_OPCODE_MOV &&
2658           inst->dst.file == MRF) {
2659          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2660          if (prev_inst && inst->equals(prev_inst)) {
2661             inst->remove();
2662             progress = true;
2663             continue;
2664          }
2665       }
2666
2667       /* Clear out the last-write records for MRFs that were overwritten. */
2668       if (inst->dst.file == MRF) {
2669          last_mrf_move[inst->dst.reg] = NULL;
2670       }
2671
2672       if (inst->mlen > 0 && inst->base_mrf != -1) {
2673          /* Found a SEND instruction, which will include two or fewer
2674           * implied MRF writes.  We could do better here.
2675           */
2676          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2677             last_mrf_move[inst->base_mrf + i] = NULL;
2678          }
2679       }
2680
2681       /* Clear out any MRF move records whose sources got overwritten. */
2682       if (inst->dst.file == GRF) {
2683          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2684             if (last_mrf_move[i] &&
2685                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2686                last_mrf_move[i] = NULL;
2687             }
2688          }
2689       }
2690
2691       if (inst->opcode == BRW_OPCODE_MOV &&
2692           inst->dst.file == MRF &&
2693           inst->src[0].file == GRF &&
2694           !inst->is_partial_write()) {
2695          last_mrf_move[inst->dst.reg] = inst;
2696       }
2697    }
2698
2699    if (progress)
2700       invalidate_live_intervals();
2701
2702    return progress;
2703 }
2704
2705 static void
2706 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2707                         int first_grf, int grf_len)
2708 {
2709    bool inst_simd16 = (dispatch_width > 8 &&
2710                        !inst->force_uncompressed &&
2711                        !inst->force_sechalf);
2712
2713    /* Clear the flag for registers that actually got read (as expected). */
2714    for (int i = 0; i < 3; i++) {
2715       int grf;
2716       if (inst->src[i].file == GRF) {
2717          grf = inst->src[i].reg;
2718       } else if (inst->src[i].file == HW_REG &&
2719                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2720          grf = inst->src[i].fixed_hw_reg.nr;
2721       } else {
2722          continue;
2723       }
2724
2725       if (grf >= first_grf &&
2726           grf < first_grf + grf_len) {
2727          deps[grf - first_grf] = false;
2728          if (inst_simd16)
2729             deps[grf - first_grf + 1] = false;
2730       }
2731    }
2732 }
2733
2734 /**
2735  * Implements this workaround for the original 965:
2736  *
2737  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2738  *      check for post destination dependencies on this instruction, software
2739  *      must ensure that there is no destination hazard for the case of ‘write
2740  *      followed by a posted write’ shown in the following example.
2741  *
2742  *      1. mov r3 0
2743  *      2. send r3.xy <rest of send instruction>
2744  *      3. mov r2 r3
2745  *
2746  *      Due to no post-destination dependency check on the ‘send’, the above
2747  *      code sequence could have two instructions (1 and 2) in flight at the
2748  *      same time that both consider ‘r3’ as the target of their final writes.
2749  */
2750 void
2751 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2752 {
2753    int reg_size = dispatch_width / 8;
2754    int write_len = inst->regs_written * reg_size;
2755    int first_write_grf = inst->dst.reg;
2756    bool needs_dep[BRW_MAX_MRF];
2757    assert(write_len < (int)sizeof(needs_dep) - 1);
2758
2759    memset(needs_dep, false, sizeof(needs_dep));
2760    memset(needs_dep, true, write_len);
2761
2762    clear_deps_for_inst_src(inst, dispatch_width,
2763                            needs_dep, first_write_grf, write_len);
2764
2765    /* Walk backwards looking for writes to registers we're writing which
2766     * aren't read since being written.  If we hit the start of the program,
2767     * we assume that there are no outstanding dependencies on entry to the
2768     * program.
2769     */
2770    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2771         scan_inst != NULL;
2772         scan_inst = (fs_inst *)scan_inst->prev) {
2773
2774       /* If we hit control flow, assume that there *are* outstanding
2775        * dependencies, and force their cleanup before our instruction.
2776        */
2777       if (scan_inst->is_control_flow()) {
2778          for (int i = 0; i < write_len; i++) {
2779             if (needs_dep[i]) {
2780                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2781             }
2782          }
2783          return;
2784       }
2785
2786       bool scan_inst_simd16 = (dispatch_width > 8 &&
2787                                !scan_inst->force_uncompressed &&
2788                                !scan_inst->force_sechalf);
2789
2790       /* We insert our reads as late as possible on the assumption that any
2791        * instruction but a MOV that might have left us an outstanding
2792        * dependency has more latency than a MOV.
2793        */
2794       if (scan_inst->dst.file == GRF) {
2795          for (int i = 0; i < scan_inst->regs_written; i++) {
2796             int reg = scan_inst->dst.reg + i * reg_size;
2797
2798             if (reg >= first_write_grf &&
2799                 reg < first_write_grf + write_len &&
2800                 needs_dep[reg - first_write_grf]) {
2801                inst->insert_before(DEP_RESOLVE_MOV(reg));
2802                needs_dep[reg - first_write_grf] = false;
2803                if (scan_inst_simd16)
2804                   needs_dep[reg - first_write_grf + 1] = false;
2805             }
2806          }
2807       }
2808
2809       /* Clear the flag for registers that actually got read (as expected). */
2810       clear_deps_for_inst_src(scan_inst, dispatch_width,
2811                               needs_dep, first_write_grf, write_len);
2812
2813       /* Continue the loop only if we haven't resolved all the dependencies */
2814       int i;
2815       for (i = 0; i < write_len; i++) {
2816          if (needs_dep[i])
2817             break;
2818       }
2819       if (i == write_len)
2820          return;
2821    }
2822 }
2823
2824 /**
2825  * Implements this workaround for the original 965:
2826  *
2827  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2828  *      used as a destination register until after it has been sourced by an
2829  *      instruction with a different destination register.
2830  */
2831 void
2832 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2833 {
2834    int write_len = inst->regs_written * dispatch_width / 8;
2835    int first_write_grf = inst->dst.reg;
2836    bool needs_dep[BRW_MAX_MRF];
2837    assert(write_len < (int)sizeof(needs_dep) - 1);
2838
2839    memset(needs_dep, false, sizeof(needs_dep));
2840    memset(needs_dep, true, write_len);
2841    /* Walk forwards looking for writes to registers we're writing which aren't
2842     * read before being written.
2843     */
2844    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2845         !scan_inst->is_tail_sentinel();
2846         scan_inst = (fs_inst *)scan_inst->next) {
2847       /* If we hit control flow, force resolve all remaining dependencies. */
2848       if (scan_inst->is_control_flow()) {
2849          for (int i = 0; i < write_len; i++) {
2850             if (needs_dep[i])
2851                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2852          }
2853          return;
2854       }
2855
2856       /* Clear the flag for registers that actually got read (as expected). */
2857       clear_deps_for_inst_src(scan_inst, dispatch_width,
2858                               needs_dep, first_write_grf, write_len);
2859
2860       /* We insert our reads as late as possible since they're reading the
2861        * result of a SEND, which has massive latency.
2862        */
2863       if (scan_inst->dst.file == GRF &&
2864           scan_inst->dst.reg >= first_write_grf &&
2865           scan_inst->dst.reg < first_write_grf + write_len &&
2866           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2867          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2868          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2869       }
2870
2871       /* Continue the loop only if we haven't resolved all the dependencies */
2872       int i;
2873       for (i = 0; i < write_len; i++) {
2874          if (needs_dep[i])
2875             break;
2876       }
2877       if (i == write_len)
2878          return;
2879    }
2880
2881    /* If we hit the end of the program, resolve all remaining dependencies out
2882     * of paranoia.
2883     */
2884    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2885    assert(last_inst->eot);
2886    for (int i = 0; i < write_len; i++) {
2887       if (needs_dep[i])
2888          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2889    }
2890 }
2891
2892 void
2893 fs_visitor::insert_gen4_send_dependency_workarounds()
2894 {
2895    if (brw->gen != 4 || brw->is_g4x)
2896       return;
2897
2898    /* Note that we're done with register allocation, so GRF fs_regs always
2899     * have a .reg_offset of 0.
2900     */
2901
2902    foreach_list_safe(node, &this->instructions) {
2903       fs_inst *inst = (fs_inst *)node;
2904
2905       if (inst->mlen != 0 && inst->dst.file == GRF) {
2906          insert_gen4_pre_send_dependency_workarounds(inst);
2907          insert_gen4_post_send_dependency_workarounds(inst);
2908       }
2909    }
2910 }
2911
2912 /**
2913  * Turns the generic expression-style uniform pull constant load instruction
2914  * into a hardware-specific series of instructions for loading a pull
2915  * constant.
2916  *
2917  * The expression style allows the CSE pass before this to optimize out
2918  * repeated loads from the same offset, and gives the pre-register-allocation
2919  * scheduling full flexibility, while the conversion to native instructions
2920  * allows the post-register-allocation scheduler the best information
2921  * possible.
2922  *
2923  * Note that execution masking for setting up pull constant loads is special:
2924  * the channels that need to be written are unrelated to the current execution
2925  * mask, since a later instruction will use one of the result channels as a
2926  * source operand for all 8 or 16 of its channels.
2927  */
2928 void
2929 fs_visitor::lower_uniform_pull_constant_loads()
2930 {
2931    foreach_list(node, &this->instructions) {
2932       fs_inst *inst = (fs_inst *)node;
2933
2934       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2935          continue;
2936
2937       if (brw->gen >= 7) {
2938          /* The offset arg before was a vec4-aligned byte offset.  We need to
2939           * turn it into a dword offset.
2940           */
2941          fs_reg const_offset_reg = inst->src[1];
2942          assert(const_offset_reg.file == IMM &&
2943                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2944          const_offset_reg.imm.u /= 4;
2945          fs_reg payload = fs_reg(this, glsl_type::uint_type);
2946
2947          /* This is actually going to be a MOV, but since only the first dword
2948           * is accessed, we have a special opcode to do just that one.  Note
2949           * that this needs to be an operation that will be considered a def
2950           * by live variable analysis, or register allocation will explode.
2951           */
2952          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2953                                                payload, const_offset_reg);
2954          setup->force_writemask_all = true;
2955
2956          setup->ir = inst->ir;
2957          setup->annotation = inst->annotation;
2958          inst->insert_before(setup);
2959
2960          /* Similarly, this will only populate the first 4 channels of the
2961           * result register (since we only use smear values from 0-3), but we
2962           * don't tell the optimizer.
2963           */
2964          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2965          inst->src[1] = payload;
2966
2967          invalidate_live_intervals();
2968       } else {
2969          /* Before register allocation, we didn't tell the scheduler about the
2970           * MRF we use.  We know it's safe to use this MRF because nothing
2971           * else does except for register spill/unspill, which generates and
2972           * uses its MRF within a single IR instruction.
2973           */
2974          inst->base_mrf = 14;
2975          inst->mlen = 1;
2976       }
2977    }
2978 }
2979
2980 void
2981 fs_visitor::dump_instructions()
2982 {
2983    calculate_register_pressure();
2984
2985    int ip = 0, max_pressure = 0;
2986    foreach_list(node, &this->instructions) {
2987       backend_instruction *inst = (backend_instruction *)node;
2988       max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2989       printf("{%3d} %4d: ", regs_live_at_ip[ip], ip);
2990       dump_instruction(inst);
2991       ++ip;
2992    }
2993    printf("Maximum %3d registers live at once.\n", max_pressure);
2994 }
2995
2996 void
2997 fs_visitor::dump_instruction(backend_instruction *be_inst)
2998 {
2999    fs_inst *inst = (fs_inst *)be_inst;
3000
3001    if (inst->predicate) {
3002       printf("(%cf0.%d) ",
3003              inst->predicate_inverse ? '-' : '+',
3004              inst->flag_subreg);
3005    }
3006
3007    printf("%s", brw_instruction_name(inst->opcode));
3008    if (inst->saturate)
3009       printf(".sat");
3010    if (inst->conditional_mod) {
3011       printf("%s", conditional_modifier[inst->conditional_mod]);
3012       if (!inst->predicate &&
3013           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3014                               inst->opcode != BRW_OPCODE_IF &&
3015                               inst->opcode != BRW_OPCODE_WHILE))) {
3016          printf(".f0.%d", inst->flag_subreg);
3017       }
3018    }
3019    printf(" ");
3020
3021
3022    switch (inst->dst.file) {
3023    case GRF:
3024       printf("vgrf%d", inst->dst.reg);
3025       if (virtual_grf_sizes[inst->dst.reg] != 1 ||
3026           inst->dst.subreg_offset)
3027          printf("+%d.%d", inst->dst.reg_offset, inst->dst.subreg_offset);
3028       break;
3029    case MRF:
3030       printf("m%d", inst->dst.reg);
3031       break;
3032    case BAD_FILE:
3033       printf("(null)");
3034       break;
3035    case UNIFORM:
3036       printf("***u%d***", inst->dst.reg);
3037       break;
3038    case HW_REG:
3039       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3040          switch (inst->dst.fixed_hw_reg.nr) {
3041          case BRW_ARF_NULL:
3042             printf("null");
3043             break;
3044          case BRW_ARF_ADDRESS:
3045             printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
3046             break;
3047          case BRW_ARF_ACCUMULATOR:
3048             printf("acc%d", inst->dst.fixed_hw_reg.subnr);
3049             break;
3050          case BRW_ARF_FLAG:
3051             printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3052                              inst->dst.fixed_hw_reg.subnr);
3053             break;
3054          default:
3055             printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3056                                inst->dst.fixed_hw_reg.subnr);
3057             break;
3058          }
3059       } else {
3060          printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3061       }
3062       if (inst->dst.fixed_hw_reg.subnr)
3063          printf("+%d", inst->dst.fixed_hw_reg.subnr);
3064       break;
3065    default:
3066       printf("???");
3067       break;
3068    }
3069    printf(":%s, ", reg_encoding[inst->dst.type]);
3070
3071    for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3072       if (inst->src[i].negate)
3073          printf("-");
3074       if (inst->src[i].abs)
3075          printf("|");
3076       switch (inst->src[i].file) {
3077       case GRF:
3078          printf("vgrf%d", inst->src[i].reg);
3079          if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3080              inst->src[i].subreg_offset)
3081             printf("+%d.%d", inst->src[i].reg_offset,
3082                    inst->src[i].subreg_offset);
3083          break;
3084       case MRF:
3085          printf("***m%d***", inst->src[i].reg);
3086          break;
3087       case UNIFORM:
3088          printf("u%d", inst->src[i].reg);
3089          if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3090              inst->src[i].subreg_offset)
3091             printf("+%d.%d", inst->src[i].reg_offset,
3092                    inst->src[i].subreg_offset);
3093          break;
3094       case BAD_FILE:
3095          printf("(null)");
3096          break;
3097       case IMM:
3098          switch (inst->src[i].type) {
3099          case BRW_REGISTER_TYPE_F:
3100             printf("%ff", inst->src[i].imm.f);
3101             break;
3102          case BRW_REGISTER_TYPE_D:
3103             printf("%dd", inst->src[i].imm.i);
3104             break;
3105          case BRW_REGISTER_TYPE_UD:
3106             printf("%uu", inst->src[i].imm.u);
3107             break;
3108          default:
3109             printf("???");
3110             break;
3111          }
3112          break;
3113       case HW_REG:
3114          if (inst->src[i].fixed_hw_reg.negate)
3115             printf("-");
3116          if (inst->src[i].fixed_hw_reg.abs)
3117             printf("|");
3118          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3119             switch (inst->src[i].fixed_hw_reg.nr) {
3120             case BRW_ARF_NULL:
3121                printf("null");
3122                break;
3123             case BRW_ARF_ADDRESS:
3124                printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
3125                break;
3126             case BRW_ARF_ACCUMULATOR:
3127                printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
3128                break;
3129             case BRW_ARF_FLAG:
3130                printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3131                                 inst->src[i].fixed_hw_reg.subnr);
3132                break;
3133             default:
3134                printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3135                                   inst->src[i].fixed_hw_reg.subnr);
3136                break;
3137             }
3138          } else {
3139             printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3140          }
3141          if (inst->src[i].fixed_hw_reg.subnr)
3142             printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3143          if (inst->src[i].fixed_hw_reg.abs)
3144             printf("|");
3145          break;
3146       default:
3147          printf("???");
3148          break;
3149       }
3150       if (inst->src[i].abs)
3151          printf("|");
3152
3153       if (inst->src[i].file != IMM) {
3154          printf(":%s", brw_reg_type_letters(inst->src[i].type));
3155       }
3156
3157       if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3158          printf(", ");
3159    }
3160
3161    printf(" ");
3162
3163    if (inst->force_uncompressed)
3164       printf("1sthalf ");
3165
3166    if (inst->force_sechalf)
3167       printf("2ndhalf ");
3168
3169    printf("\n");
3170 }
3171
3172 /**
3173  * Possibly returns an instruction that set up @param reg.
3174  *
3175  * Sometimes we want to take the result of some expression/variable
3176  * dereference tree and rewrite the instruction generating the result
3177  * of the tree.  When processing the tree, we know that the
3178  * instructions generated are all writing temporaries that are dead
3179  * outside of this tree.  So, if we have some instructions that write
3180  * a temporary, we're free to point that temp write somewhere else.
3181  *
3182  * Note that this doesn't guarantee that the instruction generated
3183  * only reg -- it might be the size=4 destination of a texture instruction.
3184  */
3185 fs_inst *
3186 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3187                                            fs_inst *end,
3188                                            fs_reg reg)
3189 {
3190    if (end == start ||
3191        end->is_partial_write() ||
3192        reg.reladdr ||
3193        !reg.equals(end->dst)) {
3194       return NULL;
3195    } else {
3196       return end;
3197    }
3198 }
3199
3200 void
3201 fs_visitor::setup_payload_gen6()
3202 {
3203    bool uses_depth =
3204       (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3205    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3206
3207    assert(brw->gen >= 6);
3208
3209    /* R0-1: masks, pixel X/Y coordinates. */
3210    c->nr_payload_regs = 2;
3211    /* R2: only for 32-pixel dispatch.*/
3212
3213    /* R3-26: barycentric interpolation coordinates.  These appear in the
3214     * same order that they appear in the brw_wm_barycentric_interp_mode
3215     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3216     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3217     * appear if they were enabled using the "Barycentric Interpolation
3218     * Mode" bits in WM_STATE.
3219     */
3220    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3221       if (barycentric_interp_modes & (1 << i)) {
3222          c->barycentric_coord_reg[i] = c->nr_payload_regs;
3223          c->nr_payload_regs += 2;
3224          if (dispatch_width == 16) {
3225             c->nr_payload_regs += 2;
3226          }
3227       }
3228    }
3229
3230    /* R27: interpolated depth if uses source depth */
3231    if (uses_depth) {
3232       c->source_depth_reg = c->nr_payload_regs;
3233       c->nr_payload_regs++;
3234       if (dispatch_width == 16) {
3235          /* R28: interpolated depth if not SIMD8. */
3236          c->nr_payload_regs++;
3237       }
3238    }
3239    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3240    if (uses_depth) {
3241       c->source_w_reg = c->nr_payload_regs;
3242       c->nr_payload_regs++;
3243       if (dispatch_width == 16) {
3244          /* R30: interpolated W if not SIMD8. */
3245          c->nr_payload_regs++;
3246       }
3247    }
3248
3249    c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3250    /* R31: MSAA position offsets. */
3251    if (c->prog_data.uses_pos_offset) {
3252       c->sample_pos_reg = c->nr_payload_regs;
3253       c->nr_payload_regs++;
3254    }
3255
3256    /* R32: MSAA input coverage mask */
3257    if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3258       assert(brw->gen >= 7);
3259       c->sample_mask_reg = c->nr_payload_regs;
3260       c->nr_payload_regs++;
3261       if (dispatch_width == 16) {
3262          /* R33: input coverage mask if not SIMD8. */
3263          c->nr_payload_regs++;
3264       }
3265    }
3266
3267    /* R34-: bary for 32-pixel. */
3268    /* R58-59: interp W for 32-pixel. */
3269
3270    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3271       c->source_depth_to_render_target = true;
3272    }
3273 }
3274
3275 void
3276 fs_visitor::assign_binding_table_offsets()
3277 {
3278    uint32_t next_binding_table_offset = 0;
3279
3280    /* If there are no color regions, we still perform an FB write to a null
3281     * renderbuffer, which we place at surface index 0.
3282     */
3283    c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3284    next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3285
3286    assign_common_binding_table_offsets(next_binding_table_offset);
3287 }
3288
3289 void
3290 fs_visitor::calculate_register_pressure()
3291 {
3292    calculate_live_intervals();
3293
3294    int num_instructions = 0;
3295    foreach_list(node, &this->instructions) {
3296       ++num_instructions;
3297    }
3298
3299    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3300
3301    for (int reg = 0; reg < virtual_grf_count; reg++) {
3302       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3303          regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3304    }
3305 }
3306
3307 /**
3308  * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3309  *
3310  * The needs_unlit_centroid_workaround ends up producing one of these per
3311  * channel of centroid input, so it's good to clean them up.
3312  *
3313  * An assumption here is that nothing ever modifies the dispatched pixels
3314  * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3315  * dictates that anyway.
3316  */
3317 void
3318 fs_visitor::opt_drop_redundant_mov_to_flags()
3319 {
3320    bool flag_mov_found[2] = {false};
3321
3322    foreach_list_safe(node, &this->instructions) {
3323       fs_inst *inst = (fs_inst *)node;
3324
3325       if (inst->is_control_flow()) {
3326          memset(flag_mov_found, 0, sizeof(flag_mov_found));
3327       } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3328          if (!flag_mov_found[inst->flag_subreg])
3329             flag_mov_found[inst->flag_subreg] = true;
3330          else
3331             inst->remove();
3332       } else if (inst->writes_flag()) {
3333          flag_mov_found[inst->flag_subreg] = false;
3334       }
3335    }
3336 }
3337
3338 bool
3339 fs_visitor::run()
3340 {
3341    sanity_param_count = fp->Base.Parameters->NumParameters;
3342    bool allocated_without_spills;
3343
3344    assign_binding_table_offsets();
3345
3346    if (brw->gen >= 6)
3347       setup_payload_gen6();
3348    else
3349       setup_payload_gen4();
3350
3351    if (0) {
3352       emit_dummy_fs();
3353    } else {
3354       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3355          emit_shader_time_begin();
3356
3357       calculate_urb_setup();
3358       if (fp->Base.InputsRead > 0) {
3359          if (brw->gen < 6)
3360             emit_interpolation_setup_gen4();
3361          else
3362             emit_interpolation_setup_gen6();
3363       }
3364
3365       /* We handle discards by keeping track of the still-live pixels in f0.1.
3366        * Initialize it with the dispatched pixels.
3367        */
3368       if (fp->UsesKill || c->key.alpha_test_func) {
3369          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3370          discard_init->flag_subreg = 1;
3371       }
3372
3373       /* Generate FS IR for main().  (the visitor only descends into
3374        * functions called "main").
3375        */
3376       if (shader) {
3377          foreach_list(node, &*shader->base.ir) {
3378             ir_instruction *ir = (ir_instruction *)node;
3379             base_ir = ir;
3380             this->result = reg_undef;
3381             ir->accept(this);
3382          }
3383       } else {
3384          emit_fragment_program_code();
3385       }
3386       base_ir = NULL;
3387       if (failed)
3388          return false;
3389
3390       emit(FS_OPCODE_PLACEHOLDER_HALT);
3391
3392       if (c->key.alpha_test_func)
3393          emit_alpha_test();
3394
3395       emit_fb_writes();
3396
3397       split_virtual_grfs();
3398
3399       move_uniform_array_access_to_pull_constants();
3400       remove_dead_constants();
3401       setup_pull_constants();
3402
3403       opt_drop_redundant_mov_to_flags();
3404
3405       bool progress;
3406       do {
3407          progress = false;
3408
3409          compact_virtual_grfs();
3410
3411          progress = remove_duplicate_mrf_writes() || progress;
3412
3413          progress = opt_algebraic() || progress;
3414          progress = opt_cse() || progress;
3415          progress = opt_copy_propagate() || progress;
3416          progress = opt_peephole_predicated_break() || progress;
3417          progress = dead_code_eliminate() || progress;
3418          progress = dead_code_eliminate_local() || progress;
3419          progress = opt_peephole_sel() || progress;
3420          progress = dead_control_flow_eliminate(this) || progress;
3421          progress = opt_saturate_propagation() || progress;
3422          progress = register_coalesce() || progress;
3423          progress = compute_to_mrf() || progress;
3424       } while (progress);
3425
3426       lower_uniform_pull_constant_loads();
3427
3428       assign_curb_setup();
3429       assign_urb_setup();
3430
3431       static enum instruction_scheduler_mode pre_modes[] = {
3432          SCHEDULE_PRE,
3433          SCHEDULE_PRE_NON_LIFO,
3434          SCHEDULE_PRE_LIFO,
3435       };
3436
3437       /* Try each scheduling heuristic to see if it can successfully register
3438        * allocate without spilling.  They should be ordered by decreasing
3439        * performance but increasing likelihood of allocating.
3440        */
3441       for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3442          schedule_instructions(pre_modes[i]);
3443
3444          if (0) {
3445             assign_regs_trivial();
3446             allocated_without_spills = true;
3447          } else {
3448             allocated_without_spills = assign_regs(false);
3449          }
3450          if (allocated_without_spills)
3451             break;
3452       }
3453
3454       if (!allocated_without_spills) {
3455          /* We assume that any spilling is worse than just dropping back to
3456           * SIMD8.  There's probably actually some intermediate point where
3457           * SIMD16 with a couple of spills is still better.
3458           */
3459          if (dispatch_width == 16) {
3460             fail("Failure to register allocate.  Reduce number of "
3461                  "live scalar values to avoid this.");
3462          }
3463
3464          /* Since we're out of heuristics, just go spill registers until we
3465           * get an allocation.
3466           */
3467          while (!assign_regs(true)) {
3468             if (failed)
3469                break;
3470          }
3471       }
3472    }
3473    assert(force_uncompressed_stack == 0);
3474
3475    /* This must come after all optimization and register allocation, since
3476     * it inserts dead code that happens to have side effects, and it does
3477     * so based on the actual physical registers in use.
3478     */
3479    insert_gen4_send_dependency_workarounds();
3480
3481    if (failed)
3482       return false;
3483
3484    if (!allocated_without_spills)
3485       schedule_instructions(SCHEDULE_POST);
3486
3487    if (dispatch_width == 8)
3488       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3489    else
3490       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3491
3492    /* If any state parameters were appended, then ParameterValues could have
3493     * been realloced, in which case the driver uniform storage set up by
3494     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3495     * sure that didn't happen.
3496     */
3497    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3498
3499    return !failed;
3500 }
3501
3502 const unsigned *
3503 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3504                struct gl_fragment_program *fp,
3505                struct gl_shader_program *prog,
3506                unsigned *final_assembly_size)
3507 {
3508    bool start_busy = false;
3509    double start_time = 0;
3510
3511    if (unlikely(brw->perf_debug)) {
3512       start_busy = (brw->batch.last_bo &&
3513                     drm_intel_bo_busy(brw->batch.last_bo));
3514       start_time = get_time();
3515    }
3516
3517    struct brw_shader *shader = NULL;
3518    if (prog)
3519       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3520
3521    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3522       if (prog) {
3523          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3524          _mesa_print_ir(shader->base.ir, NULL);
3525          printf("\n\n");
3526       } else {
3527          printf("ARB_fragment_program %d ir for native fragment shader\n",
3528                 fp->Base.Id);
3529          _mesa_print_program(&fp->Base);
3530       }
3531    }
3532
3533    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3534     */
3535    fs_visitor v(brw, c, prog, fp, 8);
3536    if (!v.run()) {
3537       if (prog) {
3538          prog->LinkStatus = false;
3539          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3540       }
3541
3542       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3543                     v.fail_msg);
3544
3545       return NULL;
3546    }
3547
3548    exec_list *simd16_instructions = NULL;
3549    fs_visitor v2(brw, c, prog, fp, 16);
3550    if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3551       if (c->prog_data.base.nr_pull_params == 0) {
3552          /* Try a SIMD16 compile */
3553          v2.import_uniforms(&v);
3554          if (!v2.run()) {
3555             perf_debug("SIMD16 shader failed to compile, falling back to "
3556                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3557          } else {
3558             simd16_instructions = &v2.instructions;
3559          }
3560       } else {
3561          perf_debug("Skipping SIMD16 due to pull parameters.\n");
3562       }
3563    }
3564
3565    const unsigned *assembly = NULL;
3566    if (brw->gen >= 8) {
3567       gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3568       assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3569                                      final_assembly_size);
3570    } else {
3571       fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3572       assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3573                                      final_assembly_size);
3574    }
3575
3576    if (unlikely(brw->perf_debug) && shader) {
3577       if (shader->compiled_once)
3578          brw_wm_debug_recompile(brw, prog, &c->key);
3579       shader->compiled_once = true;
3580
3581       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3582          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3583                     (get_time() - start_time) * 1000);
3584       }
3585    }
3586
3587    return assembly;
3588 }
3589
3590 bool
3591 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3592 {
3593    struct brw_context *brw = brw_context(ctx);
3594    struct brw_wm_prog_key key;
3595
3596    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3597       return true;
3598
3599    struct gl_fragment_program *fp = (struct gl_fragment_program *)
3600       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3601    struct brw_fragment_program *bfp = brw_fragment_program(fp);
3602    bool program_uses_dfdy = fp->UsesDFdy;
3603
3604    memset(&key, 0, sizeof(key));
3605
3606    if (brw->gen < 6) {
3607       if (fp->UsesKill)
3608          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3609
3610       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3611          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3612
3613       /* Just assume depth testing. */
3614       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3615       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3616    }
3617
3618    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3619                                          BRW_FS_VARYING_INPUT_MASK) > 16)
3620       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3621
3622    key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3623
3624    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3625    for (unsigned i = 0; i < sampler_count; i++) {
3626       if (fp->Base.ShadowSamplers & (1 << i)) {
3627          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3628          key.tex.swizzles[i] =
3629             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3630       } else {
3631          /* Color sampler: assume no swizzling. */
3632          key.tex.swizzles[i] = SWIZZLE_XYZW;
3633       }
3634    }
3635
3636    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3637       key.drawable_height = ctx->DrawBuffer->Height;
3638    }
3639
3640    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3641          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3642          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3643
3644    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3645       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3646                           key.nr_color_regions > 1;
3647    }
3648
3649    /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE.  The
3650     * quality of the derivatives is likely to be determined by the driconf
3651     * option.
3652     */
3653    key.high_quality_derivatives = brw->disable_derivative_optimization;
3654
3655    key.program_string_id = bfp->id;
3656
3657    uint32_t old_prog_offset = brw->wm.base.prog_offset;
3658    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3659
3660    bool success = do_wm_prog(brw, prog, bfp, &key);
3661
3662    brw->wm.base.prog_offset = old_prog_offset;
3663    brw->wm.prog_data = old_prog_data;
3664
3665    return success;
3666 }