src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/hash_table.h"
  36 #include "main/macros.h"
  37 #include "main/shaderobj.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "brw_dead_control_flow.h"
  50 #include "main/uniforms.h"
  51 #include "brw_fs_live_variables.h"
  52 #include "glsl/glsl_types.h"
  53
  54 void
  55 fs_inst::init()
  56 {
  57    memset(this, 0, sizeof(*this));
  58    this->opcode = BRW_OPCODE_NOP;
  59    this->conditional_mod = BRW_CONDITIONAL_NONE;
  60
  61    this->dst = reg_undef;
  62    this->src[0] = reg_undef;
  63    this->src[1] = reg_undef;
  64    this->src[2] = reg_undef;
  65
  66    /* This will be the case for almost all instructions. */
  67    this->regs_written = 1;
  68 }
  69
  70 fs_inst::fs_inst()
  71 {
  72    init();
  73 }
  74
  75 fs_inst::fs_inst(enum opcode opcode)
  76 {
  77    init();
  78    this->opcode = opcode;
  79 }
  80
  81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  82 {
  83    init();
  84    this->opcode = opcode;
  85    this->dst = dst;
  86
  87    if (dst.file == GRF)
  88       assert(dst.reg_offset >= 0);
  89 }
  90
  91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  92 {
  93    init();
  94    this->opcode = opcode;
  95    this->dst = dst;
  96    this->src[0] = src0;
  97
  98    if (dst.file == GRF)
  99       assert(dst.reg_offset >= 0);
 100    if (src[0].file == GRF)
 101       assert(src[0].reg_offset >= 0);
 102 }
 103
 104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 105 {
 106    init();
 107    this->opcode = opcode;
 108    this->dst = dst;
 109    this->src[0] = src0;
 110    this->src[1] = src1;
 111
 112    if (dst.file == GRF)
 113       assert(dst.reg_offset >= 0);
 114    if (src[0].file == GRF)
 115       assert(src[0].reg_offset >= 0);
 116    if (src[1].file == GRF)
 117       assert(src[1].reg_offset >= 0);
 118 }
 119
 120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 121                  fs_reg src0, fs_reg src1, fs_reg src2)
 122 {
 123    init();
 124    this->opcode = opcode;
 125    this->dst = dst;
 126    this->src[0] = src0;
 127    this->src[1] = src1;
 128    this->src[2] = src2;
 129
 130    if (dst.file == GRF)
 131       assert(dst.reg_offset >= 0);
 132    if (src[0].file == GRF)
 133       assert(src[0].reg_offset >= 0);
 134    if (src[1].file == GRF)
 135       assert(src[1].reg_offset >= 0);
 136    if (src[2].file == GRF)
 137       assert(src[2].reg_offset >= 0);
 138 }
 139
 140 #define ALU1(op)                                                        \
 141    fs_inst *                                                            \
 142    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 143    {                                                                    \
 144       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 145    }
 146
 147 #define ALU2(op)                                                        \
 148    fs_inst *                                                            \
 149    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 150    {                                                                    \
 151       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 152    }
 153
 154 #define ALU3(op)                                                        \
 155    fs_inst *                                                            \
 156    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
 157    {                                                                    \
 158       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 159    }
 160
 161 ALU1(NOT)
 162 ALU1(MOV)
 163 ALU1(FRC)
 164 ALU1(RNDD)
 165 ALU1(RNDE)
 166 ALU1(RNDZ)
 167 ALU2(ADD)
 168 ALU2(MUL)
 169 ALU2(MACH)
 170 ALU2(AND)
 171 ALU2(OR)
 172 ALU2(XOR)
 173 ALU2(SHL)
 174 ALU2(SHR)
 175 ALU2(ASR)
 176 ALU3(LRP)
 177 ALU1(BFREV)
 178 ALU3(BFE)
 179 ALU2(BFI1)
 180 ALU3(BFI2)
 181 ALU1(FBH)
 182 ALU1(FBL)
 183 ALU1(CBIT)
 184 ALU3(MAD)
 185 ALU2(ADDC)
 186 ALU2(SUBB)
 187 ALU2(SEL)
 188
 189 /** Gen4 predicated IF. */
 190 fs_inst *
 191 fs_visitor::IF(uint32_t predicate)
 192 {
 193    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 194    inst->predicate = predicate;
 195    return inst;
 196 }
 197
 198 /** Gen6 IF with embedded comparison. */
 199 fs_inst *
 200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 201 {
 202    assert(brw->gen == 6);
 203    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 204                                         reg_null_d, src0, src1);
 205    inst->conditional_mod = condition;
 206    return inst;
 207 }
 208
 209 /**
 210  * CMP: Sets the low bit of the destination channels with the result
 211  * of the comparison, while the upper bits are undefined, and updates
 212  * the flag register with the packed 16 bits of the result.
 213  */
 214 fs_inst *
 215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 216 {
 217    fs_inst *inst;
 218
 219    /* Take the instruction:
 220     *
 221     * CMP null<d> src0<f> src1<f>
 222     *
 223     * Original gen4 does type conversion to the destination type before
 224     * comparison, producing garbage results for floating point comparisons.
 225     * gen5 does the comparison on the execution type (resolved source types),
 226     * so dst type doesn't matter.  gen6 does comparison and then uses the
 227     * result as if it was the dst type with no conversion, which happens to
 228     * mostly work out for float-interpreted-as-int since our comparisons are
 229     * for >0, =0, <0.
 230     */
 231    if (brw->gen == 4) {
 232       dst.type = src0.type;
 233       if (dst.file == HW_REG)
 234          dst.fixed_hw_reg.type = dst.type;
 235    }
 236
 237    resolve_ud_negate(&src0);
 238    resolve_ud_negate(&src1);
 239
 240    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 241    inst->conditional_mod = condition;
 242
 243    return inst;
 244 }
 245
 246 exec_list
 247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
 248                                        fs_reg varying_offset,
 249                                        uint32_t const_offset)
 250 {
 251    exec_list instructions;
 252    fs_inst *inst;
 253
 254    /* We have our constant surface use a pitch of 4 bytes, so our index can
 255     * be any component of a vector, and then we load 4 contiguous
 256     * components starting from that.
 257     *
 258     * We break down the const_offset to a portion added to the variable
 259     * offset and a portion done using reg_offset, which means that if you
 260     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 261     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 262     * CSE can later notice that those loads are all the same and eliminate
 263     * the redundant ones.
 264     */
 265    fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
 266    instructions.push_tail(ADD(vec4_offset,
 267                               varying_offset, const_offset & ~3));
 268
 269    int scale = 1;
 270    if (brw->gen == 4 && dispatch_width == 8) {
 271       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 272        * u, v, r) as parameters, or we can just use the SIMD16 message
 273        * consisting of (header, u).  We choose the second, at the cost of a
 274        * longer return length.
 275        */
 276       scale = 2;
 277    }
 278
 279    enum opcode op;
 280    if (brw->gen >= 7)
 281       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 282    else
 283       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 284    fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
 285    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 286    inst->regs_written = 4 * scale;
 287    instructions.push_tail(inst);
 288
 289    if (brw->gen < 7) {
 290       inst->base_mrf = 13;
 291       inst->header_present = true;
 292       if (brw->gen == 4)
 293          inst->mlen = 3;
 294       else
 295          inst->mlen = 1 + dispatch_width / 8;
 296    }
 297
 298    vec4_result.reg_offset += (const_offset & 3) * scale;
 299    instructions.push_tail(MOV(dst, vec4_result));
 300
 301    return instructions;
 302 }
 303
 304 /**
 305  * A helper for MOV generation for fixing up broken hardware SEND dependency
 306  * handling.
 307  */
 308 fs_inst *
 309 fs_visitor::DEP_RESOLVE_MOV(int grf)
 310 {
 311    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 312
 313    inst->ir = NULL;
 314    inst->annotation = "send dependency resolve";
 315
 316    /* The caller always wants uncompressed to emit the minimal extra
 317     * dependencies, and to avoid having to deal with aligning its regs to 2.
 318     */
 319    inst->force_uncompressed = true;
 320
 321    return inst;
 322 }
 323
 324 bool
 325 fs_inst::equals(fs_inst *inst)
 326 {
 327    return (opcode == inst->opcode &&
 328            dst.equals(inst->dst) &&
 329            src[0].equals(inst->src[0]) &&
 330            src[1].equals(inst->src[1]) &&
 331            src[2].equals(inst->src[2]) &&
 332            saturate == inst->saturate &&
 333            predicate == inst->predicate &&
 334            conditional_mod == inst->conditional_mod &&
 335            mlen == inst->mlen &&
 336            base_mrf == inst->base_mrf &&
 337            sampler == inst->sampler &&
 338            target == inst->target &&
 339            eot == inst->eot &&
 340            header_present == inst->header_present &&
 341            shadow_compare == inst->shadow_compare &&
 342            offset == inst->offset);
 343 }
 344
 345 bool
 346 fs_inst::overwrites_reg(const fs_reg &reg)
 347 {
 348    return (reg.file == dst.file &&
 349            reg.reg == dst.reg &&
 350            reg.reg_offset >= dst.reg_offset  &&
 351            reg.reg_offset < dst.reg_offset + regs_written);
 352 }
 353
 354 bool
 355 fs_inst::is_send_from_grf()
 356 {
 357    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 358            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
 359            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 360             src[1].file == GRF) ||
 361            (is_tex() && src[0].file == GRF));
 362 }
 363
 364 bool
 365 fs_visitor::can_do_source_mods(fs_inst *inst)
 366 {
 367    if (brw->gen == 6 && inst->is_math())
 368       return false;
 369
 370    if (inst->is_send_from_grf())
 371       return false;
 372
 373    if (!inst->can_do_source_mods())
 374       return false;
 375
 376    return true;
 377 }
 378
 379 void
 380 fs_reg::init()
 381 {
 382    memset(this, 0, sizeof(*this));
 383    stride = 1;
 384 }
 385
 386 /** Generic unset register constructor. */
 387 fs_reg::fs_reg()
 388 {
 389    init();
 390    this->file = BAD_FILE;
 391 }
 392
 393 /** Immediate value constructor. */
 394 fs_reg::fs_reg(float f)
 395 {
 396    init();
 397    this->file = IMM;
 398    this->type = BRW_REGISTER_TYPE_F;
 399    this->imm.f = f;
 400 }
 401
 402 /** Immediate value constructor. */
 403 fs_reg::fs_reg(int32_t i)
 404 {
 405    init();
 406    this->file = IMM;
 407    this->type = BRW_REGISTER_TYPE_D;
 408    this->imm.i = i;
 409 }
 410
 411 /** Immediate value constructor. */
 412 fs_reg::fs_reg(uint32_t u)
 413 {
 414    init();
 415    this->file = IMM;
 416    this->type = BRW_REGISTER_TYPE_UD;
 417    this->imm.u = u;
 418 }
 419
 420 /** Fixed brw_reg. */
 421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 422 {
 423    init();
 424    this->file = HW_REG;
 425    this->fixed_hw_reg = fixed_hw_reg;
 426    this->type = fixed_hw_reg.type;
 427 }
 428
 429 bool
 430 fs_reg::equals(const fs_reg &r) const
 431 {
 432    return (file == r.file &&
 433            reg == r.reg &&
 434            reg_offset == r.reg_offset &&
 435            subreg_offset == r.subreg_offset &&
 436            type == r.type &&
 437            negate == r.negate &&
 438            abs == r.abs &&
 439            !reladdr && !r.reladdr &&
 440            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 441                   sizeof(fixed_hw_reg)) == 0 &&
 442            stride == r.stride &&
 443            imm.u == r.imm.u);
 444 }
 445
 446 fs_reg
 447 fs_reg::retype(uint32_t type)
 448 {
 449    fs_reg result = *this;
 450    result.type = type;
 451    return result;
 452 }
 453
 454 fs_reg &
 455 fs_reg::apply_stride(unsigned stride)
 456 {
 457    assert((this->stride * stride) <= 4 &&
 458           (is_power_of_two(stride) || stride == 0) &&
 459           file != HW_REG && file != IMM);
 460    this->stride *= stride;
 461    return *this;
 462 }
 463
 464 fs_reg &
 465 fs_reg::set_smear(unsigned subreg)
 466 {
 467    assert(file != HW_REG && file != IMM);
 468    subreg_offset = subreg * type_sz(type);
 469    stride = 0;
 470    return *this;
 471 }
 472
 473 bool
 474 fs_reg::is_contiguous() const
 475 {
 476    return stride == 1;
 477 }
 478
 479 bool
 480 fs_reg::is_zero() const
 481 {
 482    if (file != IMM)
 483       return false;
 484
 485    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 486 }
 487
 488 bool
 489 fs_reg::is_one() const
 490 {
 491    if (file != IMM)
 492       return false;
 493
 494    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 495 }
 496
 497 bool
 498 fs_reg::is_null() const
 499 {
 500    return file == HW_REG &&
 501           fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 502           fixed_hw_reg.nr == BRW_ARF_NULL;
 503 }
 504
 505 bool
 506 fs_reg::is_valid_3src() const
 507 {
 508    return file == GRF || file == UNIFORM;
 509 }
 510
 511 int
 512 fs_visitor::type_size(const struct glsl_type *type)
 513 {
 514    unsigned int size, i;
 515
 516    switch (type->base_type) {
 517    case GLSL_TYPE_UINT:
 518    case GLSL_TYPE_INT:
 519    case GLSL_TYPE_FLOAT:
 520    case GLSL_TYPE_BOOL:
 521       return type->components();
 522    case GLSL_TYPE_ARRAY:
 523       return type_size(type->fields.array) * type->length;
 524    case GLSL_TYPE_STRUCT:
 525       size = 0;
 526       for (i = 0; i < type->length; i++) {
 527          size += type_size(type->fields.structure[i].type);
 528       }
 529       return size;
 530    case GLSL_TYPE_SAMPLER:
 531       /* Samplers take up no register space, since they're baked in at
 532        * link time.
 533        */
 534       return 0;
 535    case GLSL_TYPE_ATOMIC_UINT:
 536       return 0;
 537    case GLSL_TYPE_IMAGE:
 538    case GLSL_TYPE_VOID:
 539    case GLSL_TYPE_ERROR:
 540    case GLSL_TYPE_INTERFACE:
 541       assert(!"not reached");
 542       break;
 543    }
 544
 545    return 0;
 546 }
 547
 548 fs_reg
 549 fs_visitor::get_timestamp()
 550 {
 551    assert(brw->gen >= 7);
 552
 553    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 554                                           BRW_ARF_TIMESTAMP,
 555                                           0),
 556                              BRW_REGISTER_TYPE_UD));
 557
 558    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 559
 560    fs_inst *mov = emit(MOV(dst, ts));
 561    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 562     * even if it's not enabled in the dispatch.
 563     */
 564    mov->force_writemask_all = true;
 565    mov->force_uncompressed = true;
 566
 567    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 568     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 569     * which is plenty of time for our purposes.  It is identical across the
 570     * EUs, but since it's tracking GPU core speed it will increment at a
 571     * varying rate as render P-states change.
 572     *
 573     * The caller could also check if render P-states have changed (or anything
 574     * else that might disrupt timing) by setting smear to 2 and checking if
 575     * that field is != 0.
 576     */
 577    dst.set_smear(0);
 578
 579    return dst;
 580 }
 581
 582 void
 583 fs_visitor::emit_shader_time_begin()
 584 {
 585    current_annotation = "shader time start";
 586    shader_start_time = get_timestamp();
 587 }
 588
 589 void
 590 fs_visitor::emit_shader_time_end()
 591 {
 592    current_annotation = "shader time end";
 593
 594    enum shader_time_shader_type type, written_type, reset_type;
 595    if (dispatch_width == 8) {
 596       type = ST_FS8;
 597       written_type = ST_FS8_WRITTEN;
 598       reset_type = ST_FS8_RESET;
 599    } else {
 600       assert(dispatch_width == 16);
 601       type = ST_FS16;
 602       written_type = ST_FS16_WRITTEN;
 603       reset_type = ST_FS16_RESET;
 604    }
 605
 606    fs_reg shader_end_time = get_timestamp();
 607
 608    /* Check that there weren't any timestamp reset events (assuming these
 609     * were the only two timestamp reads that happened).
 610     */
 611    fs_reg reset = shader_end_time;
 612    reset.set_smear(2);
 613    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 614    test->conditional_mod = BRW_CONDITIONAL_Z;
 615    emit(IF(BRW_PREDICATE_NORMAL));
 616
 617    push_force_uncompressed();
 618    fs_reg start = shader_start_time;
 619    start.negate = true;
 620    fs_reg diff = fs_reg(this, glsl_type::uint_type);
 621    emit(ADD(diff, start, shader_end_time));
 622
 623    /* If there were no instructions between the two timestamp gets, the diff
 624     * is 2 cycles.  Remove that overhead, so I can forget about that when
 625     * trying to determine the time taken for single instructions.
 626     */
 627    emit(ADD(diff, diff, fs_reg(-2u)));
 628
 629    emit_shader_time_write(type, diff);
 630    emit_shader_time_write(written_type, fs_reg(1u));
 631    emit(BRW_OPCODE_ELSE);
 632    emit_shader_time_write(reset_type, fs_reg(1u));
 633    emit(BRW_OPCODE_ENDIF);
 634
 635    pop_force_uncompressed();
 636 }
 637
 638 void
 639 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 640                                    fs_reg value)
 641 {
 642    int shader_time_index =
 643       brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
 644    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 645
 646    fs_reg payload;
 647    if (dispatch_width == 8)
 648       payload = fs_reg(this, glsl_type::uvec2_type);
 649    else
 650       payload = fs_reg(this, glsl_type::uint_type);
 651
 652    emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 653                 fs_reg(), payload, offset, value));
 654 }
 655
 656 void
 657 fs_visitor::fail(const char *format, ...)
 658 {
 659    va_list va;
 660    char *msg;
 661
 662    if (failed)
 663       return;
 664
 665    failed = true;
 666
 667    va_start(va, format);
 668    msg = ralloc_vasprintf(mem_ctx, format, va);
 669    va_end(va);
 670    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 671
 672    this->fail_msg = msg;
 673
 674    if (INTEL_DEBUG & DEBUG_WM) {
 675       fprintf(stderr, "%s",  msg);
 676    }
 677 }
 678
 679 fs_inst *
 680 fs_visitor::emit(enum opcode opcode)
 681 {
 682    return emit(fs_inst(opcode));
 683 }
 684
 685 fs_inst *
 686 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 687 {
 688    return emit(fs_inst(opcode, dst));
 689 }
 690
 691 fs_inst *
 692 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 693 {
 694    return emit(fs_inst(opcode, dst, src0));
 695 }
 696
 697 fs_inst *
 698 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 699 {
 700    return emit(fs_inst(opcode, dst, src0, src1));
 701 }
 702
 703 fs_inst *
 704 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 705                  fs_reg src0, fs_reg src1, fs_reg src2)
 706 {
 707    return emit(fs_inst(opcode, dst, src0, src1, src2));
 708 }
 709
 710 void
 711 fs_visitor::push_force_uncompressed()
 712 {
 713    force_uncompressed_stack++;
 714 }
 715
 716 void
 717 fs_visitor::pop_force_uncompressed()
 718 {
 719    force_uncompressed_stack--;
 720    assert(force_uncompressed_stack >= 0);
 721 }
 722
 723 /**
 724  * Returns true if the instruction has a flag that means it won't
 725  * update an entire destination register.
 726  *
 727  * For example, dead code elimination and live variable analysis want to know
 728  * when a write to a variable screens off any preceding values that were in
 729  * it.
 730  */
 731 bool
 732 fs_inst::is_partial_write()
 733 {
 734    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 735            this->force_uncompressed ||
 736            this->force_sechalf || !this->dst.is_contiguous());
 737 }
 738
 739 int
 740 fs_inst::regs_read(fs_visitor *v, int arg)
 741 {
 742    if (is_tex() && arg == 0 && src[0].file == GRF) {
 743       if (v->dispatch_width == 16)
 744          return (mlen + 1) / 2;
 745       else
 746          return mlen;
 747    }
 748    return 1;
 749 }
 750
 751 bool
 752 fs_inst::reads_flag()
 753 {
 754    return predicate;
 755 }
 756
 757 bool
 758 fs_inst::writes_flag()
 759 {
 760    return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
 761           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 762 }
 763
 764 /**
 765  * Returns how many MRFs an FS opcode will write over.
 766  *
 767  * Note that this is not the 0 or 1 implied writes in an actual gen
 768  * instruction -- the FS opcodes often generate MOVs in addition.
 769  */
 770 int
 771 fs_visitor::implied_mrf_writes(fs_inst *inst)
 772 {
 773    if (inst->mlen == 0)
 774       return 0;
 775
 776    if (inst->base_mrf == -1)
 777       return 0;
 778
 779    switch (inst->opcode) {
 780    case SHADER_OPCODE_RCP:
 781    case SHADER_OPCODE_RSQ:
 782    case SHADER_OPCODE_SQRT:
 783    case SHADER_OPCODE_EXP2:
 784    case SHADER_OPCODE_LOG2:
 785    case SHADER_OPCODE_SIN:
 786    case SHADER_OPCODE_COS:
 787       return 1 * dispatch_width / 8;
 788    case SHADER_OPCODE_POW:
 789    case SHADER_OPCODE_INT_QUOTIENT:
 790    case SHADER_OPCODE_INT_REMAINDER:
 791       return 2 * dispatch_width / 8;
 792    case SHADER_OPCODE_TEX:
 793    case FS_OPCODE_TXB:
 794    case SHADER_OPCODE_TXD:
 795    case SHADER_OPCODE_TXF:
 796    case SHADER_OPCODE_TXF_CMS:
 797    case SHADER_OPCODE_TXF_MCS:
 798    case SHADER_OPCODE_TG4:
 799    case SHADER_OPCODE_TG4_OFFSET:
 800    case SHADER_OPCODE_TXL:
 801    case SHADER_OPCODE_TXS:
 802    case SHADER_OPCODE_LOD:
 803       return 1;
 804    case FS_OPCODE_FB_WRITE:
 805       return 2;
 806    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 807    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 808       return 1;
 809    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 810       return inst->mlen;
 811    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 812       return 2;
 813    case SHADER_OPCODE_UNTYPED_ATOMIC:
 814    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 815       return 0;
 816    default:
 817       assert(!"not reached");
 818       return inst->mlen;
 819    }
 820 }
 821
 822 int
 823 fs_visitor::virtual_grf_alloc(int size)
 824 {
 825    if (virtual_grf_array_size <= virtual_grf_count) {
 826       if (virtual_grf_array_size == 0)
 827          virtual_grf_array_size = 16;
 828       else
 829          virtual_grf_array_size *= 2;
 830       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 831                                    virtual_grf_array_size);
 832    }
 833    virtual_grf_sizes[virtual_grf_count] = size;
 834    return virtual_grf_count++;
 835 }
 836
 837 /** Fixed HW reg constructor. */
 838 fs_reg::fs_reg(enum register_file file, int reg)
 839 {
 840    init();
 841    this->file = file;
 842    this->reg = reg;
 843    this->type = BRW_REGISTER_TYPE_F;
 844 }
 845
 846 /** Fixed HW reg constructor. */
 847 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 848 {
 849    init();
 850    this->file = file;
 851    this->reg = reg;
 852    this->type = type;
 853 }
 854
 855 /** Automatic reg constructor. */
 856 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 857 {
 858    init();
 859
 860    this->file = GRF;
 861    this->reg = v->virtual_grf_alloc(v->type_size(type));
 862    this->reg_offset = 0;
 863    this->type = brw_type_for_base_type(type);
 864 }
 865
 866 fs_reg *
 867 fs_visitor::variable_storage(ir_variable *var)
 868 {
 869    return (fs_reg *)hash_table_find(this->variable_ht, var);
 870 }
 871
 872 void
 873 import_uniforms_callback(const void *key,
 874                          void *data,
 875                          void *closure)
 876 {
 877    struct hash_table *dst_ht = (struct hash_table *)closure;
 878    const fs_reg *reg = (const fs_reg *)data;
 879
 880    if (reg->file != UNIFORM)
 881       return;
 882
 883    hash_table_insert(dst_ht, data, key);
 884 }
 885
 886 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 887  * This brings in those uniform definitions
 888  */
 889 void
 890 fs_visitor::import_uniforms(fs_visitor *v)
 891 {
 892    hash_table_call_foreach(v->variable_ht,
 893                            import_uniforms_callback,
 894                            variable_ht);
 895    this->params_remap = v->params_remap;
 896    this->nr_params_remap = v->nr_params_remap;
 897 }
 898
 899 /* Our support for uniforms is piggy-backed on the struct
 900  * gl_fragment_program, because that's where the values actually
 901  * get stored, rather than in some global gl_shader_program uniform
 902  * store.
 903  */
 904 void
 905 fs_visitor::setup_uniform_values(ir_variable *ir)
 906 {
 907    int namelen = strlen(ir->name);
 908
 909    /* The data for our (non-builtin) uniforms is stored in a series of
 910     * gl_uniform_driver_storage structs for each subcomponent that
 911     * glGetUniformLocation() could name.  We know it's been set up in the same
 912     * order we'd walk the type, so walk the list of storage and find anything
 913     * with our name, or the prefix of a component that starts with our name.
 914     */
 915    unsigned params_before = stage_prog_data->nr_params;
 916    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 917       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 918
 919       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 920           (storage->name[namelen] != 0 &&
 921            storage->name[namelen] != '.' &&
 922            storage->name[namelen] != '[')) {
 923          continue;
 924       }
 925
 926       unsigned slots = storage->type->component_slots();
 927       if (storage->array_elements)
 928          slots *= storage->array_elements;
 929
 930       for (unsigned i = 0; i < slots; i++) {
 931          stage_prog_data->param[stage_prog_data->nr_params++] =
 932             &storage->storage[i].f;
 933       }
 934    }
 935
 936    /* Make sure we actually initialized the right amount of stuff here. */
 937    assert(params_before + ir->type->component_slots() ==
 938           stage_prog_data->nr_params);
 939    (void)params_before;
 940 }
 941
 942
 943 /* Our support for builtin uniforms is even scarier than non-builtin.
 944  * It sits on top of the PROG_STATE_VAR parameters that are
 945  * automatically updated from GL context state.
 946  */
 947 void
 948 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 949 {
 950    const ir_state_slot *const slots = ir->state_slots;
 951    assert(ir->state_slots != NULL);
 952
 953    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 954       /* This state reference has already been setup by ir_to_mesa, but we'll
 955        * get the same index back here.
 956        */
 957       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 958                                             (gl_state_index *)slots[i].tokens);
 959
 960       /* Add each of the unique swizzles of the element as a parameter.
 961        * This'll end up matching the expected layout of the
 962        * array/matrix/structure we're trying to fill in.
 963        */
 964       int last_swiz = -1;
 965       for (unsigned int j = 0; j < 4; j++) {
 966          int swiz = GET_SWZ(slots[i].swizzle, j);
 967          if (swiz == last_swiz)
 968             break;
 969          last_swiz = swiz;
 970
 971          stage_prog_data->param[stage_prog_data->nr_params++] =
 972             &fp->Base.Parameters->ParameterValues[index][swiz].f;
 973       }
 974    }
 975 }
 976
 977 fs_reg *
 978 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 979 {
 980    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 981    fs_reg wpos = *reg;
 982    bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
 983
 984    /* gl_FragCoord.x */
 985    if (ir->data.pixel_center_integer) {
 986       emit(MOV(wpos, this->pixel_x));
 987    } else {
 988       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
 989    }
 990    wpos.reg_offset++;
 991
 992    /* gl_FragCoord.y */
 993    if (!flip && ir->data.pixel_center_integer) {
 994       emit(MOV(wpos, this->pixel_y));
 995    } else {
 996       fs_reg pixel_y = this->pixel_y;
 997       float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
 998
 999       if (flip) {
1000          pixel_y.negate = true;
1001          offset += c->key.drawable_height - 1.0;
1002       }
1003
1004       emit(ADD(wpos, pixel_y, fs_reg(offset)));
1005    }
1006    wpos.reg_offset++;
1007
1008    /* gl_FragCoord.z */
1009    if (brw->gen >= 6) {
1010       emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1011    } else {
1012       emit(FS_OPCODE_LINTERP, wpos,
1013            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1014            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1015            interp_reg(VARYING_SLOT_POS, 2));
1016    }
1017    wpos.reg_offset++;
1018
1019    /* gl_FragCoord.w: Already set up in emit_interpolation */
1020    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1021
1022    return reg;
1023 }
1024
1025 fs_inst *
1026 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1027                          glsl_interp_qualifier interpolation_mode,
1028                          bool is_centroid, bool is_sample)
1029 {
1030    brw_wm_barycentric_interp_mode barycoord_mode;
1031    if (brw->gen >= 6) {
1032       if (is_centroid) {
1033          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1034             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1035          else
1036             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1037       } else if (is_sample) {
1038           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1039             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1040          else
1041             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1042       } else {
1043          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1044             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1045          else
1046             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1047       }
1048    } else {
1049       /* On Ironlake and below, there is only one interpolation mode.
1050        * Centroid interpolation doesn't mean anything on this hardware --
1051        * there is no multisampling.
1052        */
1053       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1054    }
1055    return emit(FS_OPCODE_LINTERP, attr,
1056                this->delta_x[barycoord_mode],
1057                this->delta_y[barycoord_mode], interp);
1058 }
1059
1060 fs_reg *
1061 fs_visitor::emit_general_interpolation(ir_variable *ir)
1062 {
1063    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1064    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1065    fs_reg attr = *reg;
1066
1067    unsigned int array_elements;
1068    const glsl_type *type;
1069
1070    if (ir->type->is_array()) {
1071       array_elements = ir->type->length;
1072       if (array_elements == 0) {
1073          fail("dereferenced array '%s' has length 0\n", ir->name);
1074       }
1075       type = ir->type->fields.array;
1076    } else {
1077       array_elements = 1;
1078       type = ir->type;
1079    }
1080
1081    glsl_interp_qualifier interpolation_mode =
1082       ir->determine_interpolation_mode(c->key.flat_shade);
1083
1084    int location = ir->data.location;
1085    for (unsigned int i = 0; i < array_elements; i++) {
1086       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1087          if (c->prog_data.urb_setup[location] == -1) {
1088             /* If there's no incoming setup data for this slot, don't
1089              * emit interpolation for it.
1090              */
1091             attr.reg_offset += type->vector_elements;
1092             location++;
1093             continue;
1094          }
1095
1096          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1097             /* Constant interpolation (flat shading) case. The SF has
1098              * handed us defined values in only the constant offset
1099              * field of the setup reg.
1100              */
1101             for (unsigned int k = 0; k < type->vector_elements; k++) {
1102                struct brw_reg interp = interp_reg(location, k);
1103                interp = suboffset(interp, 3);
1104                interp.type = reg->type;
1105                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1106                attr.reg_offset++;
1107             }
1108          } else {
1109             /* Smooth/noperspective interpolation case. */
1110             for (unsigned int k = 0; k < type->vector_elements; k++) {
1111                struct brw_reg interp = interp_reg(location, k);
1112                emit_linterp(attr, fs_reg(interp), interpolation_mode,
1113                             ir->data.centroid && !c->key.persample_shading,
1114                             ir->data.sample || c->key.persample_shading);
1115                if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1116                   /* Get the pixel/sample mask into f0 so that we know
1117                    * which pixels are lit.  Then, for each channel that is
1118                    * unlit, replace the centroid data with non-centroid
1119                    * data.
1120                    */
1121                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1122                   fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1123                                                interpolation_mode,
1124                                                false, false);
1125                   inst->predicate = BRW_PREDICATE_NORMAL;
1126                   inst->predicate_inverse = true;
1127                }
1128                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1129                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1130                }
1131                attr.reg_offset++;
1132             }
1133
1134          }
1135          location++;
1136       }
1137    }
1138
1139    return reg;
1140 }
1141
1142 fs_reg *
1143 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1144 {
1145    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1146
1147    /* The frontfacing comes in as a bit in the thread payload. */
1148    if (brw->gen >= 6) {
1149       emit(BRW_OPCODE_ASR, *reg,
1150            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1151            fs_reg(15));
1152       emit(BRW_OPCODE_NOT, *reg, *reg);
1153       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1154    } else {
1155       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1156       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1157        * us front face
1158        */
1159       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1160       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1161    }
1162
1163    return reg;
1164 }
1165
1166 void
1167 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1168 {
1169    assert(dst.type == BRW_REGISTER_TYPE_F);
1170
1171    if (c->key.compute_pos_offset) {
1172       /* Convert int_sample_pos to floating point */
1173       emit(MOV(dst, int_sample_pos));
1174       /* Scale to the range [0, 1] */
1175       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1176    }
1177    else {
1178       /* From ARB_sample_shading specification:
1179        * "When rendering to a non-multisample buffer, or if multisample
1180        *  rasterization is disabled, gl_SamplePosition will always be
1181        *  (0.5, 0.5).
1182        */
1183       emit(MOV(dst, fs_reg(0.5f)));
1184    }
1185 }
1186
1187 fs_reg *
1188 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1189 {
1190    assert(brw->gen >= 6);
1191    assert(ir->type == glsl_type::vec2_type);
1192
1193    this->current_annotation = "compute sample position";
1194    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1195    fs_reg pos = *reg;
1196    fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1197    fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1198
1199    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1200     * mode will be enabled.
1201     *
1202     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1203     * R31.1:0         Position Offset X/Y for Slot[3:0]
1204     * R31.3:2         Position Offset X/Y for Slot[7:4]
1205     * .....
1206     *
1207     * The X, Y sample positions come in as bytes in  thread payload. So, read
1208     * the positions using vstride=16, width=8, hstride=2.
1209     */
1210    struct brw_reg sample_pos_reg =
1211       stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1212                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1213
1214    emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1215    if (dispatch_width == 16) {
1216       fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1217                                fs_reg(suboffset(sample_pos_reg, 16))));
1218       inst->force_sechalf = true;
1219    }
1220    /* Compute gl_SamplePosition.x */
1221    compute_sample_position(pos, int_sample_x);
1222    pos.reg_offset++;
1223    emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1224    if (dispatch_width == 16) {
1225       fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1226                                fs_reg(suboffset(sample_pos_reg, 17))));
1227       inst->force_sechalf = true;
1228    }
1229    /* Compute gl_SamplePosition.y */
1230    compute_sample_position(pos, int_sample_y);
1231    return reg;
1232 }
1233
1234 fs_reg *
1235 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1236 {
1237    assert(brw->gen >= 6);
1238
1239    this->current_annotation = "compute sample id";
1240    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1241
1242    if (c->key.compute_sample_id) {
1243       fs_reg t1 = fs_reg(this, glsl_type::int_type);
1244       fs_reg t2 = fs_reg(this, glsl_type::int_type);
1245       t2.type = BRW_REGISTER_TYPE_UW;
1246
1247       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1248        * 8x multisampling, subspan 0 will represent sample N (where N
1249        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1250        * 7. We can find the value of N by looking at R0.0 bits 7:6
1251        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1252        * (since samples are always delivered in pairs). That is, we
1253        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1254        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1255        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1256        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1257        * populating a temporary variable with the sequence (0, 1, 2, 3),
1258        * and then reading from it using vstride=1, width=4, hstride=0.
1259        * These computations hold good for 4x multisampling as well.
1260        */
1261       emit(BRW_OPCODE_AND, t1,
1262            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1263            fs_reg(brw_imm_d(0xc0)));
1264       emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1265       /* This works for both SIMD8 and SIMD16 */
1266       emit(MOV(t2, brw_imm_v(0x3210)));
1267       /* This special instruction takes care of setting vstride=1,
1268        * width=4, hstride=0 of t2 during an ADD instruction.
1269        */
1270       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1271    } else {
1272       /* As per GL_ARB_sample_shading specification:
1273        * "When rendering to a non-multisample buffer, or if multisample
1274        *  rasterization is disabled, gl_SampleID will always be zero."
1275        */
1276       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1277    }
1278
1279    return reg;
1280 }
1281
1282 fs_reg *
1283 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1284 {
1285    assert(brw->gen >= 7);
1286    this->current_annotation = "compute gl_SampleMaskIn";
1287    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1288    emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1289    return reg;
1290 }
1291
1292 fs_reg
1293 fs_visitor::fix_math_operand(fs_reg src)
1294 {
1295    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1296     * might be able to do better by doing execsize = 1 math and then
1297     * expanding that result out, but we would need to be careful with
1298     * masking.
1299     *
1300     * The hardware ignores source modifiers (negate and abs) on math
1301     * instructions, so we also move to a temp to set those up.
1302     */
1303    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1304        !src.abs && !src.negate)
1305       return src;
1306
1307    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1308     * operands to math
1309     */
1310    if (brw->gen >= 7 && src.file != IMM)
1311       return src;
1312
1313    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1314    expanded.type = src.type;
1315    emit(BRW_OPCODE_MOV, expanded, src);
1316    return expanded;
1317 }
1318
1319 fs_inst *
1320 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1321 {
1322    switch (opcode) {
1323    case SHADER_OPCODE_RCP:
1324    case SHADER_OPCODE_RSQ:
1325    case SHADER_OPCODE_SQRT:
1326    case SHADER_OPCODE_EXP2:
1327    case SHADER_OPCODE_LOG2:
1328    case SHADER_OPCODE_SIN:
1329    case SHADER_OPCODE_COS:
1330       break;
1331    default:
1332       assert(!"not reached: bad math opcode");
1333       return NULL;
1334    }
1335
1336    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1337     * might be able to do better by doing execsize = 1 math and then
1338     * expanding that result out, but we would need to be careful with
1339     * masking.
1340     *
1341     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1342     * instructions, so we also move to a temp to set those up.
1343     */
1344    if (brw->gen >= 6)
1345       src = fix_math_operand(src);
1346
1347    fs_inst *inst = emit(opcode, dst, src);
1348
1349    if (brw->gen < 6) {
1350       inst->base_mrf = 2;
1351       inst->mlen = dispatch_width / 8;
1352    }
1353
1354    return inst;
1355 }
1356
1357 fs_inst *
1358 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1359 {
1360    int base_mrf = 2;
1361    fs_inst *inst;
1362
1363    switch (opcode) {
1364    case SHADER_OPCODE_INT_QUOTIENT:
1365    case SHADER_OPCODE_INT_REMAINDER:
1366       if (brw->gen >= 7 && dispatch_width == 16)
1367          fail("SIMD16 INTDIV unsupported\n");
1368       break;
1369    case SHADER_OPCODE_POW:
1370       break;
1371    default:
1372       assert(!"not reached: unsupported binary math opcode.");
1373       return NULL;
1374    }
1375
1376    if (brw->gen >= 6) {
1377       src0 = fix_math_operand(src0);
1378       src1 = fix_math_operand(src1);
1379
1380       inst = emit(opcode, dst, src0, src1);
1381    } else {
1382       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1383        * "Message Payload":
1384        *
1385        * "Operand0[7].  For the INT DIV functions, this operand is the
1386        *  denominator."
1387        *  ...
1388        * "Operand1[7].  For the INT DIV functions, this operand is the
1389        *  numerator."
1390        */
1391       bool is_int_div = opcode != SHADER_OPCODE_POW;
1392       fs_reg &op0 = is_int_div ? src1 : src0;
1393       fs_reg &op1 = is_int_div ? src0 : src1;
1394
1395       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1396       inst = emit(opcode, dst, op0, reg_null_f);
1397
1398       inst->base_mrf = base_mrf;
1399       inst->mlen = 2 * dispatch_width / 8;
1400    }
1401    return inst;
1402 }
1403
1404 void
1405 fs_visitor::assign_curb_setup()
1406 {
1407    c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1408    if (dispatch_width == 8) {
1409       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1410    } else {
1411       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1412    }
1413
1414    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1415    foreach_list(node, &this->instructions) {
1416       fs_inst *inst = (fs_inst *)node;
1417
1418       for (unsigned int i = 0; i < 3; i++) {
1419          if (inst->src[i].file == UNIFORM) {
1420             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1421             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1422                                                   constant_nr / 8,
1423                                                   constant_nr % 8);
1424
1425             inst->src[i].file = HW_REG;
1426             inst->src[i].fixed_hw_reg = byte_offset(
1427                retype(brw_reg, inst->src[i].type),
1428                inst->src[i].subreg_offset);
1429          }
1430       }
1431    }
1432 }
1433
1434 void
1435 fs_visitor::calculate_urb_setup()
1436 {
1437    for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1438       c->prog_data.urb_setup[i] = -1;
1439    }
1440
1441    int urb_next = 0;
1442    /* Figure out where each of the incoming setup attributes lands. */
1443    if (brw->gen >= 6) {
1444       if (_mesa_bitcount_64(fp->Base.InputsRead &
1445                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1446          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1447           * first 16 varying inputs, so we can put them wherever we want.
1448           * Just put them in order.
1449           *
1450           * This is useful because it means that (a) inputs not used by the
1451           * fragment shader won't take up valuable register space, and (b) we
1452           * won't have to recompile the fragment shader if it gets paired with
1453           * a different vertex (or geometry) shader.
1454           */
1455          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1456             if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1457                 BITFIELD64_BIT(i)) {
1458                c->prog_data.urb_setup[i] = urb_next++;
1459             }
1460          }
1461       } else {
1462          /* We have enough input varyings that the SF/SBE pipeline stage can't
1463           * arbitrarily rearrange them to suit our whim; we have to put them
1464           * in an order that matches the output of the previous pipeline stage
1465           * (geometry or vertex shader).
1466           */
1467          struct brw_vue_map prev_stage_vue_map;
1468          brw_compute_vue_map(brw, &prev_stage_vue_map,
1469                              c->key.input_slots_valid);
1470          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1471          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1472          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1473               slot++) {
1474             int varying = prev_stage_vue_map.slot_to_varying[slot];
1475             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1476              * unused.
1477              */
1478             if (varying != BRW_VARYING_SLOT_COUNT &&
1479                 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1480                  BITFIELD64_BIT(varying))) {
1481                c->prog_data.urb_setup[varying] = slot - first_slot;
1482             }
1483          }
1484          urb_next = prev_stage_vue_map.num_slots - first_slot;
1485       }
1486    } else {
1487       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1488       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1489          /* Point size is packed into the header, not as a general attribute */
1490          if (i == VARYING_SLOT_PSIZ)
1491             continue;
1492
1493          if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1494             /* The back color slot is skipped when the front color is
1495              * also written to.  In addition, some slots can be
1496              * written in the vertex shader and not read in the
1497              * fragment shader.  So the register number must always be
1498              * incremented, mapped or not.
1499              */
1500             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1501                c->prog_data.urb_setup[i] = urb_next;
1502             urb_next++;
1503          }
1504       }
1505
1506       /*
1507        * It's a FS only attribute, and we did interpolation for this attribute
1508        * in SF thread. So, count it here, too.
1509        *
1510        * See compile_sf_prog() for more info.
1511        */
1512       if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1513          c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1514    }
1515
1516    c->prog_data.num_varying_inputs = urb_next;
1517 }
1518
1519 void
1520 fs_visitor::assign_urb_setup()
1521 {
1522    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1523
1524    /* Offset all the urb_setup[] index by the actual position of the
1525     * setup regs, now that the location of the constants has been chosen.
1526     */
1527    foreach_list(node, &this->instructions) {
1528       fs_inst *inst = (fs_inst *)node;
1529
1530       if (inst->opcode == FS_OPCODE_LINTERP) {
1531          assert(inst->src[2].file == HW_REG);
1532          inst->src[2].fixed_hw_reg.nr += urb_start;
1533       }
1534
1535       if (inst->opcode == FS_OPCODE_CINTERP) {
1536          assert(inst->src[0].file == HW_REG);
1537          inst->src[0].fixed_hw_reg.nr += urb_start;
1538       }
1539    }
1540
1541    /* Each attribute is 4 setup channels, each of which is half a reg. */
1542    this->first_non_payload_grf =
1543       urb_start + c->prog_data.num_varying_inputs * 2;
1544 }
1545
1546 /**
1547  * Split large virtual GRFs into separate components if we can.
1548  *
1549  * This is mostly duplicated with what brw_fs_vector_splitting does,
1550  * but that's really conservative because it's afraid of doing
1551  * splitting that doesn't result in real progress after the rest of
1552  * the optimization phases, which would cause infinite looping in
1553  * optimization.  We can do it once here, safely.  This also has the
1554  * opportunity to split interpolated values, or maybe even uniforms,
1555  * which we don't have at the IR level.
1556  *
1557  * We want to split, because virtual GRFs are what we register
1558  * allocate and spill (due to contiguousness requirements for some
1559  * instructions), and they're what we naturally generate in the
1560  * codegen process, but most virtual GRFs don't actually need to be
1561  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1562  * live intervals and better dead code elimination and coalescing.
1563  */
1564 void
1565 fs_visitor::split_virtual_grfs()
1566 {
1567    int num_vars = this->virtual_grf_count;
1568    bool split_grf[num_vars];
1569    int new_virtual_grf[num_vars];
1570
1571    /* Try to split anything > 0 sized. */
1572    for (int i = 0; i < num_vars; i++) {
1573       if (this->virtual_grf_sizes[i] != 1)
1574          split_grf[i] = true;
1575       else
1576          split_grf[i] = false;
1577    }
1578
1579    if (brw->has_pln &&
1580        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1581       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1582        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1583        * Gen6, that was the only supported interpolation mode, and since Gen6,
1584        * delta_x and delta_y are in fixed hardware registers.
1585        */
1586       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1587          false;
1588    }
1589
1590    foreach_list(node, &this->instructions) {
1591       fs_inst *inst = (fs_inst *)node;
1592
1593       /* If there's a SEND message that requires contiguous destination
1594        * registers, no splitting is allowed.
1595        */
1596       if (inst->regs_written > 1) {
1597          split_grf[inst->dst.reg] = false;
1598       }
1599
1600       /* If we're sending from a GRF, don't split it, on the assumption that
1601        * the send is reading the whole thing.
1602        */
1603       if (inst->is_send_from_grf()) {
1604          for (int i = 0; i < 3; i++) {
1605             if (inst->src[i].file == GRF) {
1606                split_grf[inst->src[i].reg] = false;
1607             }
1608          }
1609       }
1610    }
1611
1612    /* Allocate new space for split regs.  Note that the virtual
1613     * numbers will be contiguous.
1614     */
1615    for (int i = 0; i < num_vars; i++) {
1616       if (split_grf[i]) {
1617          new_virtual_grf[i] = virtual_grf_alloc(1);
1618          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1619             int reg = virtual_grf_alloc(1);
1620             assert(reg == new_virtual_grf[i] + j - 1);
1621             (void) reg;
1622          }
1623          this->virtual_grf_sizes[i] = 1;
1624       }
1625    }
1626
1627    foreach_list(node, &this->instructions) {
1628       fs_inst *inst = (fs_inst *)node;
1629
1630       if (inst->dst.file == GRF &&
1631           split_grf[inst->dst.reg] &&
1632           inst->dst.reg_offset != 0) {
1633          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1634                           inst->dst.reg_offset - 1);
1635          inst->dst.reg_offset = 0;
1636       }
1637       for (int i = 0; i < 3; i++) {
1638          if (inst->src[i].file == GRF &&
1639              split_grf[inst->src[i].reg] &&
1640              inst->src[i].reg_offset != 0) {
1641             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1642                                 inst->src[i].reg_offset - 1);
1643             inst->src[i].reg_offset = 0;
1644          }
1645       }
1646    }
1647    invalidate_live_intervals();
1648 }
1649
1650 /**
1651  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1652  *
1653  * During code generation, we create tons of temporary variables, many of
1654  * which get immediately killed and are never used again.  Yet, in later
1655  * optimization and analysis passes, such as compute_live_intervals, we need
1656  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1657  * overhead.
1658  */
1659 void
1660 fs_visitor::compact_virtual_grfs()
1661 {
1662    /* Mark which virtual GRFs are used, and count how many. */
1663    int remap_table[this->virtual_grf_count];
1664    memset(remap_table, -1, sizeof(remap_table));
1665
1666    foreach_list(node, &this->instructions) {
1667       const fs_inst *inst = (const fs_inst *) node;
1668
1669       if (inst->dst.file == GRF)
1670          remap_table[inst->dst.reg] = 0;
1671
1672       for (int i = 0; i < 3; i++) {
1673          if (inst->src[i].file == GRF)
1674             remap_table[inst->src[i].reg] = 0;
1675       }
1676    }
1677
1678    /* In addition to registers used in instructions, fs_visitor keeps
1679     * direct references to certain special values which must be patched:
1680     */
1681    fs_reg *special[] = {
1682       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1683       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1684       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1685       &delta_x[0], &delta_x[1], &delta_x[2],
1686       &delta_x[3], &delta_x[4], &delta_x[5],
1687       &delta_y[0], &delta_y[1], &delta_y[2],
1688       &delta_y[3], &delta_y[4], &delta_y[5],
1689    };
1690    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1691    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1692
1693    /* Treat all special values as used, to be conservative */
1694    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1695       if (special[i]->file == GRF)
1696          remap_table[special[i]->reg] = 0;
1697    }
1698
1699    /* Compact the GRF arrays. */
1700    int new_index = 0;
1701    for (int i = 0; i < this->virtual_grf_count; i++) {
1702       if (remap_table[i] != -1) {
1703          remap_table[i] = new_index;
1704          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1705          invalidate_live_intervals();
1706          ++new_index;
1707       }
1708    }
1709
1710    this->virtual_grf_count = new_index;
1711
1712    /* Patch all the instructions to use the newly renumbered registers */
1713    foreach_list(node, &this->instructions) {
1714       fs_inst *inst = (fs_inst *) node;
1715
1716       if (inst->dst.file == GRF)
1717          inst->dst.reg = remap_table[inst->dst.reg];
1718
1719       for (int i = 0; i < 3; i++) {
1720          if (inst->src[i].file == GRF)
1721             inst->src[i].reg = remap_table[inst->src[i].reg];
1722       }
1723    }
1724
1725    /* Patch all the references to special values */
1726    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1727       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1728          special[i]->reg = remap_table[special[i]->reg];
1729    }
1730 }
1731
1732 bool
1733 fs_visitor::remove_dead_constants()
1734 {
1735    if (dispatch_width == 8) {
1736       this->params_remap = ralloc_array(mem_ctx, int, stage_prog_data->nr_params);
1737       this->nr_params_remap = stage_prog_data->nr_params;
1738
1739       for (unsigned int i = 0; i < stage_prog_data->nr_params; i++)
1740          this->params_remap[i] = -1;
1741
1742       /* Find which params are still in use. */
1743       foreach_list(node, &this->instructions) {
1744          fs_inst *inst = (fs_inst *)node;
1745
1746          for (int i = 0; i < 3; i++) {
1747             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1748
1749             if (inst->src[i].file != UNIFORM)
1750                continue;
1751
1752             /* Section 5.11 of the OpenGL 4.3 spec says:
1753              *
1754              *     "Out-of-bounds reads return undefined values, which include
1755              *     values from other variables of the active program or zero."
1756              */
1757             if (constant_nr < 0 ||
1758                 constant_nr >= (int)stage_prog_data->nr_params) {
1759                constant_nr = 0;
1760             }
1761
1762             /* For now, set this to non-negative.  We'll give it the
1763              * actual new number in a moment, in order to keep the
1764              * register numbers nicely ordered.
1765              */
1766             this->params_remap[constant_nr] = 0;
1767          }
1768       }
1769
1770       /* Figure out what the new numbers for the params will be.  At some
1771        * point when we're doing uniform array access, we're going to want
1772        * to keep the distinction between .reg and .reg_offset, but for
1773        * now we don't care.
1774        */
1775       unsigned int new_nr_params = 0;
1776       for (unsigned int i = 0; i < stage_prog_data->nr_params; i++) {
1777          if (this->params_remap[i] != -1) {
1778             this->params_remap[i] = new_nr_params++;
1779          }
1780       }
1781
1782       /* Update the list of params to be uploaded to match our new numbering. */
1783       for (unsigned int i = 0; i < stage_prog_data->nr_params; i++) {
1784          int remapped = this->params_remap[i];
1785
1786          if (remapped == -1)
1787             continue;
1788
1789          stage_prog_data->param[remapped] = stage_prog_data->param[i];
1790       }
1791
1792       stage_prog_data->nr_params = new_nr_params;
1793    } else {
1794       /* This should have been generated in the SIMD8 pass already. */
1795       assert(this->params_remap);
1796    }
1797
1798    /* Now do the renumbering of the shader to remove unused params. */
1799    foreach_list(node, &this->instructions) {
1800       fs_inst *inst = (fs_inst *)node;
1801
1802       for (int i = 0; i < 3; i++) {
1803          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1804
1805          if (inst->src[i].file != UNIFORM)
1806             continue;
1807
1808          /* as above alias to 0 */
1809          if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1810             constant_nr = 0;
1811          }
1812          assert(this->params_remap[constant_nr] != -1);
1813          inst->src[i].reg = this->params_remap[constant_nr];
1814          inst->src[i].reg_offset = 0;
1815       }
1816    }
1817
1818    return true;
1819 }
1820
1821 /*
1822  * Implements array access of uniforms by inserting a
1823  * PULL_CONSTANT_LOAD instruction.
1824  *
1825  * Unlike temporary GRF array access (where we don't support it due to
1826  * the difficulty of doing relative addressing on instruction
1827  * destinations), we could potentially do array access of uniforms
1828  * that were loaded in GRF space as push constants.  In real-world
1829  * usage we've seen, though, the arrays being used are always larger
1830  * than we could load as push constants, so just always move all
1831  * uniform array access out to a pull constant buffer.
1832  */
1833 void
1834 fs_visitor::move_uniform_array_access_to_pull_constants()
1835 {
1836    int pull_constant_loc[stage_prog_data->nr_params];
1837
1838    for (unsigned int i = 0; i < stage_prog_data->nr_params; i++) {
1839       pull_constant_loc[i] = -1;
1840    }
1841
1842    /* Walk through and find array access of uniforms.  Put a copy of that
1843     * uniform in the pull constant buffer.
1844     *
1845     * Note that we don't move constant-indexed accesses to arrays.  No
1846     * testing has been done of the performance impact of this choice.
1847     */
1848    foreach_list_safe(node, &this->instructions) {
1849       fs_inst *inst = (fs_inst *)node;
1850
1851       for (int i = 0 ; i < 3; i++) {
1852          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1853             continue;
1854
1855          int uniform = inst->src[i].reg;
1856
1857          /* If this array isn't already present in the pull constant buffer,
1858           * add it.
1859           */
1860          if (pull_constant_loc[uniform] == -1) {
1861             const float **values = &stage_prog_data->param[uniform];
1862
1863             pull_constant_loc[uniform] = stage_prog_data->nr_pull_params;
1864
1865             assert(param_size[uniform]);
1866
1867             for (int j = 0; j < param_size[uniform]; j++) {
1868                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1869                   values[j];
1870             }
1871          }
1872
1873          /* Set up the annotation tracking for new generated instructions. */
1874          base_ir = inst->ir;
1875          current_annotation = inst->annotation;
1876
1877          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1878          fs_reg temp = fs_reg(this, glsl_type::float_type);
1879          exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1880                                                      surf_index,
1881                                                      *inst->src[i].reladdr,
1882                                                      pull_constant_loc[uniform] +
1883                                                      inst->src[i].reg_offset);
1884          inst->insert_before(&list);
1885
1886          inst->src[i].file = temp.file;
1887          inst->src[i].reg = temp.reg;
1888          inst->src[i].reg_offset = temp.reg_offset;
1889          inst->src[i].reladdr = NULL;
1890       }
1891    }
1892 }
1893
1894 /**
1895  * Choose accesses from the UNIFORM file to demote to using the pull
1896  * constant buffer.
1897  *
1898  * We allow a fragment shader to have more than the specified minimum
1899  * maximum number of fragment shader uniform components (64).  If
1900  * there are too many of these, they'd fill up all of register space.
1901  * So, this will push some of them out to the pull constant buffer and
1902  * update the program to load them.
1903  */
1904 void
1905 fs_visitor::setup_pull_constants()
1906 {
1907    /* Only allow 16 registers (128 uniform components) as push constants. */
1908    unsigned int max_uniform_components = 16 * 8;
1909    if (stage_prog_data->nr_params <= max_uniform_components)
1910       return;
1911
1912    if (dispatch_width == 16) {
1913       fail("Pull constants not supported in SIMD16\n");
1914       return;
1915    }
1916
1917    /* Just demote the end of the list.  We could probably do better
1918     * here, demoting things that are rarely used in the program first.
1919     */
1920    unsigned int pull_uniform_base = max_uniform_components;
1921
1922    int pull_constant_loc[stage_prog_data->nr_params];
1923    for (unsigned int i = 0; i < stage_prog_data->nr_params; i++) {
1924       if (i < pull_uniform_base) {
1925          pull_constant_loc[i] = -1;
1926       } else {
1927          pull_constant_loc[i] = -1;
1928          /* If our constant is already being uploaded for reladdr purposes,
1929           * reuse it.
1930           */
1931          for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j++) {
1932             if (stage_prog_data->pull_param[j] == stage_prog_data->param[i]) {
1933                pull_constant_loc[i] = j;
1934                break;
1935             }
1936          }
1937          if (pull_constant_loc[i] == -1) {
1938             int pull_index = stage_prog_data->nr_pull_params++;
1939             stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1940             pull_constant_loc[i] = pull_index;
1941          }
1942       }
1943    }
1944    stage_prog_data->nr_params = pull_uniform_base;
1945
1946    foreach_list(node, &this->instructions) {
1947       fs_inst *inst = (fs_inst *)node;
1948
1949       for (int i = 0; i < 3; i++) {
1950          if (inst->src[i].file != UNIFORM)
1951             continue;
1952
1953          int pull_index = pull_constant_loc[inst->src[i].reg +
1954                                             inst->src[i].reg_offset];
1955          if (pull_index == -1)
1956             continue;
1957
1958          assert(!inst->src[i].reladdr);
1959
1960          fs_reg dst = fs_reg(this, glsl_type::float_type);
1961          fs_reg index(stage_prog_data->binding_table.pull_constants_start);
1962          fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1963          fs_inst *pull =
1964             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1965                                  dst, index, offset);
1966          pull->ir = inst->ir;
1967          pull->annotation = inst->annotation;
1968
1969          inst->insert_before(pull);
1970
1971          inst->src[i].file = GRF;
1972          inst->src[i].reg = dst.reg;
1973          inst->src[i].reg_offset = 0;
1974          inst->src[i].set_smear(pull_index & 3);
1975       }
1976    }
1977 }
1978
1979 bool
1980 fs_visitor::opt_algebraic()
1981 {
1982    bool progress = false;
1983
1984    foreach_list(node, &this->instructions) {
1985       fs_inst *inst = (fs_inst *)node;
1986
1987       switch (inst->opcode) {
1988       case BRW_OPCODE_MUL:
1989          if (inst->src[1].file != IMM)
1990             continue;
1991
1992          /* a * 1.0 = a */
1993          if (inst->src[1].is_one()) {
1994             inst->opcode = BRW_OPCODE_MOV;
1995             inst->src[1] = reg_undef;
1996             progress = true;
1997             break;
1998          }
1999
2000          /* a * 0.0 = 0.0 */
2001          if (inst->src[1].is_zero()) {
2002             inst->opcode = BRW_OPCODE_MOV;
2003             inst->src[0] = inst->src[1];
2004             inst->src[1] = reg_undef;
2005             progress = true;
2006             break;
2007          }
2008
2009          break;
2010       case BRW_OPCODE_ADD:
2011          if (inst->src[1].file != IMM)
2012             continue;
2013
2014          /* a + 0.0 = a */
2015          if (inst->src[1].is_zero()) {
2016             inst->opcode = BRW_OPCODE_MOV;
2017             inst->src[1] = reg_undef;
2018             progress = true;
2019             break;
2020          }
2021          break;
2022       case BRW_OPCODE_OR:
2023          if (inst->src[0].equals(inst->src[1])) {
2024             inst->opcode = BRW_OPCODE_MOV;
2025             inst->src[1] = reg_undef;
2026             progress = true;
2027             break;
2028          }
2029          break;
2030       case BRW_OPCODE_LRP:
2031          if (inst->src[1].equals(inst->src[2])) {
2032             inst->opcode = BRW_OPCODE_MOV;
2033             inst->src[0] = inst->src[1];
2034             inst->src[1] = reg_undef;
2035             inst->src[2] = reg_undef;
2036             progress = true;
2037             break;
2038          }
2039          break;
2040       case BRW_OPCODE_SEL:
2041          if (inst->saturate && inst->src[1].file == IMM) {
2042             switch (inst->conditional_mod) {
2043             case BRW_CONDITIONAL_LE:
2044             case BRW_CONDITIONAL_L:
2045                switch (inst->src[1].type) {
2046                case BRW_REGISTER_TYPE_F:
2047                   if (inst->src[1].imm.f >= 1.0f) {
2048                      inst->opcode = BRW_OPCODE_MOV;
2049                      inst->src[1] = reg_undef;
2050                      progress = true;
2051                   }
2052                   break;
2053                default:
2054                   break;
2055                }
2056                break;
2057             case BRW_CONDITIONAL_GE:
2058             case BRW_CONDITIONAL_G:
2059                switch (inst->src[1].type) {
2060                case BRW_REGISTER_TYPE_F:
2061                   if (inst->src[1].imm.f <= 0.0f) {
2062                      inst->opcode = BRW_OPCODE_MOV;
2063                      inst->src[1] = reg_undef;
2064                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2065                      progress = true;
2066                   }
2067                   break;
2068                default:
2069                   break;
2070                }
2071             default:
2072                break;
2073             }
2074          }
2075          break;
2076       default:
2077          break;
2078       }
2079    }
2080
2081    return progress;
2082 }
2083
2084 /**
2085  * Removes any instructions writing a VGRF where that VGRF is not used by any
2086  * later instruction.
2087  */
2088 bool
2089 fs_visitor::dead_code_eliminate()
2090 {
2091    bool progress = false;
2092    int pc = 0;
2093
2094    calculate_live_intervals();
2095
2096    foreach_list_safe(node, &this->instructions) {
2097       fs_inst *inst = (fs_inst *)node;
2098
2099       if (inst->dst.file == GRF && !inst->has_side_effects()) {
2100          bool dead = true;
2101
2102          for (int i = 0; i < inst->regs_written; i++) {
2103             int var = live_intervals->var_from_vgrf[inst->dst.reg];
2104             assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2105             if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2106                dead = false;
2107                break;
2108             }
2109          }
2110
2111          if (dead) {
2112             /* Don't dead code eliminate instructions that write to the
2113              * accumulator as a side-effect. Instead just set the destination
2114              * to the null register to free it.
2115              */
2116             switch (inst->opcode) {
2117             case BRW_OPCODE_ADDC:
2118             case BRW_OPCODE_SUBB:
2119             case BRW_OPCODE_MACH:
2120                inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2121                break;
2122             default:
2123                inst->remove();
2124                progress = true;
2125                break;
2126             }
2127          }
2128       }
2129
2130       pc++;
2131    }
2132
2133    if (progress)
2134       invalidate_live_intervals();
2135
2136    return progress;
2137 }
2138
2139 struct dead_code_hash_key
2140 {
2141    int vgrf;
2142    int reg_offset;
2143 };
2144
2145 static bool
2146 dead_code_hash_compare(const void *a, const void *b)
2147 {
2148    return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2149 }
2150
2151 static void
2152 clear_dead_code_hash(struct hash_table *ht)
2153 {
2154    struct hash_entry *entry;
2155
2156    hash_table_foreach(ht, entry) {
2157       _mesa_hash_table_remove(ht, entry);
2158    }
2159 }
2160
2161 static void
2162 insert_dead_code_hash(struct hash_table *ht,
2163                       int vgrf, int reg_offset, fs_inst *inst)
2164 {
2165    /* We don't bother freeing keys, because they'll be GCed with the ht. */
2166    struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2167
2168    key->vgrf = vgrf;
2169    key->reg_offset = reg_offset;
2170
2171    _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2172 }
2173
2174 static struct hash_entry *
2175 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2176 {
2177    struct dead_code_hash_key key;
2178
2179    key.vgrf = vgrf;
2180    key.reg_offset = reg_offset;
2181
2182    return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2183 }
2184
2185 static void
2186 remove_dead_code_hash(struct hash_table *ht,
2187                       int vgrf, int reg_offset)
2188 {
2189    struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2190    if (!entry)
2191       return;
2192
2193    _mesa_hash_table_remove(ht, entry);
2194 }
2195
2196 /**
2197  * Walks basic blocks, removing any regs that are written but not read before
2198  * being redefined.
2199  *
2200  * The dead_code_eliminate() function implements a global dead code
2201  * elimination, but it only handles the removing the last write to a register
2202  * if it's never read.  This one can handle intermediate writes, but only
2203  * within a basic block.
2204  */
2205 bool
2206 fs_visitor::dead_code_eliminate_local()
2207 {
2208    struct hash_table *ht;
2209    bool progress = false;
2210
2211    ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2212
2213    if (ht == NULL) {
2214       return false;
2215    }
2216
2217    foreach_list_safe(node, &this->instructions) {
2218       fs_inst *inst = (fs_inst *)node;
2219
2220       /* At a basic block, empty the HT since we don't understand dataflow
2221        * here.
2222        */
2223       if (inst->is_control_flow()) {
2224          clear_dead_code_hash(ht);
2225          continue;
2226       }
2227
2228       /* Clear the HT of any instructions that got read. */
2229       for (int i = 0; i < 3; i++) {
2230          fs_reg src = inst->src[i];
2231          if (src.file != GRF)
2232             continue;
2233
2234          int read = 1;
2235          if (inst->is_send_from_grf())
2236             read = virtual_grf_sizes[src.reg] - src.reg_offset;
2237
2238          for (int reg_offset = src.reg_offset;
2239               reg_offset < src.reg_offset + read;
2240               reg_offset++) {
2241             remove_dead_code_hash(ht, src.reg, reg_offset);
2242          }
2243       }
2244
2245       /* Add any update of a GRF to the HT, removing a previous write if it
2246        * wasn't read.
2247        */
2248       if (inst->dst.file == GRF) {
2249          if (inst->regs_written > 1) {
2250             /* We don't know how to trim channels from an instruction's
2251              * writes, so we can't incrementally remove unread channels from
2252              * it.  Just remove whatever it overwrites from the table
2253              */
2254             for (int i = 0; i < inst->regs_written; i++) {
2255                remove_dead_code_hash(ht,
2256                                      inst->dst.reg,
2257                                      inst->dst.reg_offset + i);
2258             }
2259          } else {
2260             struct hash_entry *entry =
2261                get_dead_code_hash_entry(ht, inst->dst.reg,
2262                                         inst->dst.reg_offset);
2263
2264             if (entry) {
2265                if (inst->is_partial_write()) {
2266                   /* For a partial write, we can't remove any previous dead code
2267                    * candidate, since we're just modifying their result.
2268                    */
2269                } else {
2270                   /* We're completely updating a channel, and there was a
2271                    * previous write to the channel that wasn't read.  Kill it!
2272                    */
2273                   fs_inst *inst = (fs_inst *)entry->data;
2274                   inst->remove();
2275                   progress = true;
2276                }
2277
2278                _mesa_hash_table_remove(ht, entry);
2279             }
2280
2281             if (!inst->has_side_effects())
2282                insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2283                                      inst);
2284          }
2285       }
2286    }
2287
2288    _mesa_hash_table_destroy(ht, NULL);
2289
2290    if (progress)
2291       invalidate_live_intervals();
2292
2293    return progress;
2294 }
2295
2296 /**
2297  * Implements register coalescing: Checks if the two registers involved in a
2298  * raw move don't interfere, in which case they can both be stored in the same
2299  * place and the MOV removed.
2300  *
2301  * To do this, all uses of the source of the MOV in the shader are replaced
2302  * with the destination of the MOV. For example:
2303  *
2304  * add vgrf3:F, vgrf1:F, vgrf2:F
2305  * mov vgrf4:F, vgrf3:F
2306  * mul vgrf5:F, vgrf5:F, vgrf4:F
2307  *
2308  * becomes
2309  *
2310  * add vgrf4:F, vgrf1:F, vgrf2:F
2311  * mul vgrf5:F, vgrf5:F, vgrf4:F
2312  */
2313 bool
2314 fs_visitor::register_coalesce()
2315 {
2316    bool progress = false;
2317
2318    calculate_live_intervals();
2319
2320    int src_size = 0;
2321    int channels_remaining = 0;
2322    int reg_from = -1, reg_to = -1;
2323    int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2324    fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2325
2326    foreach_list(node, &this->instructions) {
2327       fs_inst *inst = (fs_inst *)node;
2328
2329       if (inst->opcode != BRW_OPCODE_MOV ||
2330           inst->is_partial_write() ||
2331           inst->saturate ||
2332           inst->src[0].file != GRF ||
2333           inst->src[0].negate ||
2334           inst->src[0].abs ||
2335           !inst->src[0].is_contiguous() ||
2336           inst->dst.file != GRF ||
2337           inst->dst.type != inst->src[0].type) {
2338          continue;
2339       }
2340
2341       if (virtual_grf_sizes[inst->src[0].reg] >
2342           virtual_grf_sizes[inst->dst.reg])
2343          continue;
2344
2345       int var_from = live_intervals->var_from_reg(&inst->src[0]);
2346       int var_to = live_intervals->var_from_reg(&inst->dst);
2347
2348       if (live_intervals->vars_interfere(var_from, var_to) &&
2349           !inst->dst.equals(inst->src[0])) {
2350
2351          /* We know that the live ranges of A (var_from) and B (var_to)
2352           * interfere because of the ->vars_interfere() call above. If the end
2353           * of B's live range is after the end of A's range, then we know two
2354           * things:
2355           *  - the start of B's live range must be in A's live range (since we
2356           *    already know the two ranges interfere, this is the only remaining
2357           *    possibility)
2358           *  - the interference isn't of the form we're looking for (where B is
2359           *    entirely inside A)
2360           */
2361          if (live_intervals->end[var_to] > live_intervals->end[var_from])
2362             continue;
2363
2364          bool overwritten = false;
2365          int scan_ip = -1;
2366
2367          foreach_list(n, &this->instructions) {
2368             fs_inst *scan_inst = (fs_inst *)n;
2369             scan_ip++;
2370
2371             if (scan_inst->is_control_flow()) {
2372                overwritten = true;
2373                break;
2374             }
2375
2376             if (scan_ip <= live_intervals->start[var_to])
2377                continue;
2378
2379             if (scan_ip > live_intervals->end[var_to])
2380                break;
2381
2382             if (scan_inst->dst.equals(inst->dst) ||
2383                 scan_inst->dst.equals(inst->src[0])) {
2384                overwritten = true;
2385                break;
2386             }
2387          }
2388
2389          if (overwritten)
2390             continue;
2391       }
2392
2393       if (reg_from != inst->src[0].reg) {
2394          reg_from = inst->src[0].reg;
2395
2396          src_size = virtual_grf_sizes[inst->src[0].reg];
2397          assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2398
2399          channels_remaining = src_size;
2400          memset(mov, 0, sizeof(mov));
2401
2402          reg_to = inst->dst.reg;
2403       }
2404
2405       if (reg_to != inst->dst.reg)
2406          continue;
2407
2408       const int offset = inst->src[0].reg_offset;
2409       reg_to_offset[offset] = inst->dst.reg_offset;
2410       mov[offset] = inst;
2411       channels_remaining--;
2412
2413       if (channels_remaining)
2414          continue;
2415
2416       bool removed = false;
2417       for (int i = 0; i < src_size; i++) {
2418          if (mov[i]) {
2419             removed = true;
2420
2421             mov[i]->opcode = BRW_OPCODE_NOP;
2422             mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2423             mov[i]->dst = reg_undef;
2424             mov[i]->src[0] = reg_undef;
2425             mov[i]->src[1] = reg_undef;
2426             mov[i]->src[2] = reg_undef;
2427          }
2428       }
2429
2430       foreach_list(node, &this->instructions) {
2431          fs_inst *scan_inst = (fs_inst *)node;
2432
2433          for (int i = 0; i < src_size; i++) {
2434             if (mov[i]) {
2435                if (scan_inst->dst.file == GRF &&
2436                    scan_inst->dst.reg == reg_from &&
2437                    scan_inst->dst.reg_offset == i) {
2438                   scan_inst->dst.reg = reg_to;
2439                   scan_inst->dst.reg_offset = reg_to_offset[i];
2440                }
2441                for (int j = 0; j < 3; j++) {
2442                   if (scan_inst->src[j].file == GRF &&
2443                       scan_inst->src[j].reg == reg_from &&
2444                       scan_inst->src[j].reg_offset == i) {
2445                      scan_inst->src[j].reg = reg_to;
2446                      scan_inst->src[j].reg_offset = reg_to_offset[i];
2447                   }
2448                }
2449             }
2450          }
2451       }
2452
2453       if (removed) {
2454          live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2455                                               live_intervals->start[var_from]);
2456          live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2457                                             live_intervals->end[var_from]);
2458          reg_from = -1;
2459       }
2460    }
2461
2462    foreach_list_safe(node, &this->instructions) {
2463       fs_inst *inst = (fs_inst *)node;
2464
2465       if (inst->opcode == BRW_OPCODE_NOP) {
2466          inst->remove();
2467          progress = true;
2468       }
2469    }
2470
2471    if (progress)
2472       invalidate_live_intervals();
2473
2474    return progress;
2475 }
2476
2477 bool
2478 fs_visitor::compute_to_mrf()
2479 {
2480    bool progress = false;
2481    int next_ip = 0;
2482
2483    calculate_live_intervals();
2484
2485    foreach_list_safe(node, &this->instructions) {
2486       fs_inst *inst = (fs_inst *)node;
2487
2488       int ip = next_ip;
2489       next_ip++;
2490
2491       if (inst->opcode != BRW_OPCODE_MOV ||
2492           inst->is_partial_write() ||
2493           inst->dst.file != MRF || inst->src[0].file != GRF ||
2494           inst->dst.type != inst->src[0].type ||
2495           inst->src[0].abs || inst->src[0].negate ||
2496           !inst->src[0].is_contiguous() ||
2497           inst->src[0].subreg_offset)
2498          continue;
2499
2500       /* Work out which hardware MRF registers are written by this
2501        * instruction.
2502        */
2503       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2504       int mrf_high;
2505       if (inst->dst.reg & BRW_MRF_COMPR4) {
2506          mrf_high = mrf_low + 4;
2507       } else if (dispatch_width == 16 &&
2508                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2509          mrf_high = mrf_low + 1;
2510       } else {
2511          mrf_high = mrf_low;
2512       }
2513
2514       /* Can't compute-to-MRF this GRF if someone else was going to
2515        * read it later.
2516        */
2517       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2518          continue;
2519
2520       /* Found a move of a GRF to a MRF.  Let's see if we can go
2521        * rewrite the thing that made this GRF to write into the MRF.
2522        */
2523       fs_inst *scan_inst;
2524       for (scan_inst = (fs_inst *)inst->prev;
2525            scan_inst->prev != NULL;
2526            scan_inst = (fs_inst *)scan_inst->prev) {
2527          if (scan_inst->dst.file == GRF &&
2528              scan_inst->dst.reg == inst->src[0].reg) {
2529             /* Found the last thing to write our reg we want to turn
2530              * into a compute-to-MRF.
2531              */
2532
2533             /* If this one instruction didn't populate all the
2534              * channels, bail.  We might be able to rewrite everything
2535              * that writes that reg, but it would require smarter
2536              * tracking to delay the rewriting until complete success.
2537              */
2538             if (scan_inst->is_partial_write())
2539                break;
2540
2541             /* Things returning more than one register would need us to
2542              * understand coalescing out more than one MOV at a time.
2543              */
2544             if (scan_inst->regs_written > 1)
2545                break;
2546
2547             /* SEND instructions can't have MRF as a destination. */
2548             if (scan_inst->mlen)
2549                break;
2550
2551             if (brw->gen == 6) {
2552                /* gen6 math instructions must have the destination be
2553                 * GRF, so no compute-to-MRF for them.
2554                 */
2555                if (scan_inst->is_math()) {
2556                   break;
2557                }
2558             }
2559
2560             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2561                /* Found the creator of our MRF's source value. */
2562                scan_inst->dst.file = MRF;
2563                scan_inst->dst.reg = inst->dst.reg;
2564                scan_inst->saturate |= inst->saturate;
2565                inst->remove();
2566                progress = true;
2567             }
2568             break;
2569          }
2570
2571          /* We don't handle control flow here.  Most computation of
2572           * values that end up in MRFs are shortly before the MRF
2573           * write anyway.
2574           */
2575          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2576             break;
2577
2578          /* You can't read from an MRF, so if someone else reads our
2579           * MRF's source GRF that we wanted to rewrite, that stops us.
2580           */
2581          bool interfered = false;
2582          for (int i = 0; i < 3; i++) {
2583             if (scan_inst->src[i].file == GRF &&
2584                 scan_inst->src[i].reg == inst->src[0].reg &&
2585                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2586                interfered = true;
2587             }
2588          }
2589          if (interfered)
2590             break;
2591
2592          if (scan_inst->dst.file == MRF) {
2593             /* If somebody else writes our MRF here, we can't
2594              * compute-to-MRF before that.
2595              */
2596             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2597             int scan_mrf_high;
2598
2599             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2600                scan_mrf_high = scan_mrf_low + 4;
2601             } else if (dispatch_width == 16 &&
2602                        (!scan_inst->force_uncompressed &&
2603                         !scan_inst->force_sechalf)) {
2604                scan_mrf_high = scan_mrf_low + 1;
2605             } else {
2606                scan_mrf_high = scan_mrf_low;
2607             }
2608
2609             if (mrf_low == scan_mrf_low ||
2610                 mrf_low == scan_mrf_high ||
2611                 mrf_high == scan_mrf_low ||
2612                 mrf_high == scan_mrf_high) {
2613                break;
2614             }
2615          }
2616
2617          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2618             /* Found a SEND instruction, which means that there are
2619              * live values in MRFs from base_mrf to base_mrf +
2620              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2621              * above it.
2622              */
2623             if (mrf_low >= scan_inst->base_mrf &&
2624                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2625                break;
2626             }
2627             if (mrf_high >= scan_inst->base_mrf &&
2628                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2629                break;
2630             }
2631          }
2632       }
2633    }
2634
2635    if (progress)
2636       invalidate_live_intervals();
2637
2638    return progress;
2639 }
2640
2641 /**
2642  * Walks through basic blocks, looking for repeated MRF writes and
2643  * removing the later ones.
2644  */
2645 bool
2646 fs_visitor::remove_duplicate_mrf_writes()
2647 {
2648    fs_inst *last_mrf_move[16];
2649    bool progress = false;
2650
2651    /* Need to update the MRF tracking for compressed instructions. */
2652    if (dispatch_width == 16)
2653       return false;
2654
2655    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2656
2657    foreach_list_safe(node, &this->instructions) {
2658       fs_inst *inst = (fs_inst *)node;
2659
2660       if (inst->is_control_flow()) {
2661          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2662       }
2663
2664       if (inst->opcode == BRW_OPCODE_MOV &&
2665           inst->dst.file == MRF) {
2666          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2667          if (prev_inst && inst->equals(prev_inst)) {
2668             inst->remove();
2669             progress = true;
2670             continue;
2671          }
2672       }
2673
2674       /* Clear out the last-write records for MRFs that were overwritten. */
2675       if (inst->dst.file == MRF) {
2676          last_mrf_move[inst->dst.reg] = NULL;
2677       }
2678
2679       if (inst->mlen > 0 && inst->base_mrf != -1) {
2680          /* Found a SEND instruction, which will include two or fewer
2681           * implied MRF writes.  We could do better here.
2682           */
2683          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2684             last_mrf_move[inst->base_mrf + i] = NULL;
2685          }
2686       }
2687
2688       /* Clear out any MRF move records whose sources got overwritten. */
2689       if (inst->dst.file == GRF) {
2690          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2691             if (last_mrf_move[i] &&
2692                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2693                last_mrf_move[i] = NULL;
2694             }
2695          }
2696       }
2697
2698       if (inst->opcode == BRW_OPCODE_MOV &&
2699           inst->dst.file == MRF &&
2700           inst->src[0].file == GRF &&
2701           !inst->is_partial_write()) {
2702          last_mrf_move[inst->dst.reg] = inst;
2703       }
2704    }
2705
2706    if (progress)
2707       invalidate_live_intervals();
2708
2709    return progress;
2710 }
2711
2712 static void
2713 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2714                         int first_grf, int grf_len)
2715 {
2716    bool inst_simd16 = (dispatch_width > 8 &&
2717                        !inst->force_uncompressed &&
2718                        !inst->force_sechalf);
2719
2720    /* Clear the flag for registers that actually got read (as expected). */
2721    for (int i = 0; i < 3; i++) {
2722       int grf;
2723       if (inst->src[i].file == GRF) {
2724          grf = inst->src[i].reg;
2725       } else if (inst->src[i].file == HW_REG &&
2726                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2727          grf = inst->src[i].fixed_hw_reg.nr;
2728       } else {
2729          continue;
2730       }
2731
2732       if (grf >= first_grf &&
2733           grf < first_grf + grf_len) {
2734          deps[grf - first_grf] = false;
2735          if (inst_simd16)
2736             deps[grf - first_grf + 1] = false;
2737       }
2738    }
2739 }
2740
2741 /**
2742  * Implements this workaround for the original 965:
2743  *
2744  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2745  *      check for post destination dependencies on this instruction, software
2746  *      must ensure that there is no destination hazard for the case of ‘write
2747  *      followed by a posted write’ shown in the following example.
2748  *
2749  *      1. mov r3 0
2750  *      2. send r3.xy <rest of send instruction>
2751  *      3. mov r2 r3
2752  *
2753  *      Due to no post-destination dependency check on the ‘send’, the above
2754  *      code sequence could have two instructions (1 and 2) in flight at the
2755  *      same time that both consider ‘r3’ as the target of their final writes.
2756  */
2757 void
2758 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2759 {
2760    int reg_size = dispatch_width / 8;
2761    int write_len = inst->regs_written * reg_size;
2762    int first_write_grf = inst->dst.reg;
2763    bool needs_dep[BRW_MAX_MRF];
2764    assert(write_len < (int)sizeof(needs_dep) - 1);
2765
2766    memset(needs_dep, false, sizeof(needs_dep));
2767    memset(needs_dep, true, write_len);
2768
2769    clear_deps_for_inst_src(inst, dispatch_width,
2770                            needs_dep, first_write_grf, write_len);
2771
2772    /* Walk backwards looking for writes to registers we're writing which
2773     * aren't read since being written.  If we hit the start of the program,
2774     * we assume that there are no outstanding dependencies on entry to the
2775     * program.
2776     */
2777    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2778         scan_inst != NULL;
2779         scan_inst = (fs_inst *)scan_inst->prev) {
2780
2781       /* If we hit control flow, assume that there *are* outstanding
2782        * dependencies, and force their cleanup before our instruction.
2783        */
2784       if (scan_inst->is_control_flow()) {
2785          for (int i = 0; i < write_len; i++) {
2786             if (needs_dep[i]) {
2787                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2788             }
2789          }
2790          return;
2791       }
2792
2793       bool scan_inst_simd16 = (dispatch_width > 8 &&
2794                                !scan_inst->force_uncompressed &&
2795                                !scan_inst->force_sechalf);
2796
2797       /* We insert our reads as late as possible on the assumption that any
2798        * instruction but a MOV that might have left us an outstanding
2799        * dependency has more latency than a MOV.
2800        */
2801       if (scan_inst->dst.file == GRF) {
2802          for (int i = 0; i < scan_inst->regs_written; i++) {
2803             int reg = scan_inst->dst.reg + i * reg_size;
2804
2805             if (reg >= first_write_grf &&
2806                 reg < first_write_grf + write_len &&
2807                 needs_dep[reg - first_write_grf]) {
2808                inst->insert_before(DEP_RESOLVE_MOV(reg));
2809                needs_dep[reg - first_write_grf] = false;
2810                if (scan_inst_simd16)
2811                   needs_dep[reg - first_write_grf + 1] = false;
2812             }
2813          }
2814       }
2815
2816       /* Clear the flag for registers that actually got read (as expected). */
2817       clear_deps_for_inst_src(scan_inst, dispatch_width,
2818                               needs_dep, first_write_grf, write_len);
2819
2820       /* Continue the loop only if we haven't resolved all the dependencies */
2821       int i;
2822       for (i = 0; i < write_len; i++) {
2823          if (needs_dep[i])
2824             break;
2825       }
2826       if (i == write_len)
2827          return;
2828    }
2829 }
2830
2831 /**
2832  * Implements this workaround for the original 965:
2833  *
2834  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2835  *      used as a destination register until after it has been sourced by an
2836  *      instruction with a different destination register.
2837  */
2838 void
2839 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2840 {
2841    int write_len = inst->regs_written * dispatch_width / 8;
2842    int first_write_grf = inst->dst.reg;
2843    bool needs_dep[BRW_MAX_MRF];
2844    assert(write_len < (int)sizeof(needs_dep) - 1);
2845
2846    memset(needs_dep, false, sizeof(needs_dep));
2847    memset(needs_dep, true, write_len);
2848    /* Walk forwards looking for writes to registers we're writing which aren't
2849     * read before being written.
2850     */
2851    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2852         !scan_inst->is_tail_sentinel();
2853         scan_inst = (fs_inst *)scan_inst->next) {
2854       /* If we hit control flow, force resolve all remaining dependencies. */
2855       if (scan_inst->is_control_flow()) {
2856          for (int i = 0; i < write_len; i++) {
2857             if (needs_dep[i])
2858                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2859          }
2860          return;
2861       }
2862
2863       /* Clear the flag for registers that actually got read (as expected). */
2864       clear_deps_for_inst_src(scan_inst, dispatch_width,
2865                               needs_dep, first_write_grf, write_len);
2866
2867       /* We insert our reads as late as possible since they're reading the
2868        * result of a SEND, which has massive latency.
2869        */
2870       if (scan_inst->dst.file == GRF &&
2871           scan_inst->dst.reg >= first_write_grf &&
2872           scan_inst->dst.reg < first_write_grf + write_len &&
2873           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2874          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2875          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2876       }
2877
2878       /* Continue the loop only if we haven't resolved all the dependencies */
2879       int i;
2880       for (i = 0; i < write_len; i++) {
2881          if (needs_dep[i])
2882             break;
2883       }
2884       if (i == write_len)
2885          return;
2886    }
2887
2888    /* If we hit the end of the program, resolve all remaining dependencies out
2889     * of paranoia.
2890     */
2891    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2892    assert(last_inst->eot);
2893    for (int i = 0; i < write_len; i++) {
2894       if (needs_dep[i])
2895          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2896    }
2897 }
2898
2899 void
2900 fs_visitor::insert_gen4_send_dependency_workarounds()
2901 {
2902    if (brw->gen != 4 || brw->is_g4x)
2903       return;
2904
2905    /* Note that we're done with register allocation, so GRF fs_regs always
2906     * have a .reg_offset of 0.
2907     */
2908
2909    foreach_list_safe(node, &this->instructions) {
2910       fs_inst *inst = (fs_inst *)node;
2911
2912       if (inst->mlen != 0 && inst->dst.file == GRF) {
2913          insert_gen4_pre_send_dependency_workarounds(inst);
2914          insert_gen4_post_send_dependency_workarounds(inst);
2915       }
2916    }
2917 }
2918
2919 /**
2920  * Turns the generic expression-style uniform pull constant load instruction
2921  * into a hardware-specific series of instructions for loading a pull
2922  * constant.
2923  *
2924  * The expression style allows the CSE pass before this to optimize out
2925  * repeated loads from the same offset, and gives the pre-register-allocation
2926  * scheduling full flexibility, while the conversion to native instructions
2927  * allows the post-register-allocation scheduler the best information
2928  * possible.
2929  *
2930  * Note that execution masking for setting up pull constant loads is special:
2931  * the channels that need to be written are unrelated to the current execution
2932  * mask, since a later instruction will use one of the result channels as a
2933  * source operand for all 8 or 16 of its channels.
2934  */
2935 void
2936 fs_visitor::lower_uniform_pull_constant_loads()
2937 {
2938    foreach_list(node, &this->instructions) {
2939       fs_inst *inst = (fs_inst *)node;
2940
2941       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2942          continue;
2943
2944       if (brw->gen >= 7) {
2945          /* The offset arg before was a vec4-aligned byte offset.  We need to
2946           * turn it into a dword offset.
2947           */
2948          fs_reg const_offset_reg = inst->src[1];
2949          assert(const_offset_reg.file == IMM &&
2950                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2951          const_offset_reg.imm.u /= 4;
2952          fs_reg payload = fs_reg(this, glsl_type::uint_type);
2953
2954          /* This is actually going to be a MOV, but since only the first dword
2955           * is accessed, we have a special opcode to do just that one.  Note
2956           * that this needs to be an operation that will be considered a def
2957           * by live variable analysis, or register allocation will explode.
2958           */
2959          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2960                                                payload, const_offset_reg);
2961          setup->force_writemask_all = true;
2962
2963          setup->ir = inst->ir;
2964          setup->annotation = inst->annotation;
2965          inst->insert_before(setup);
2966
2967          /* Similarly, this will only populate the first 4 channels of the
2968           * result register (since we only use smear values from 0-3), but we
2969           * don't tell the optimizer.
2970           */
2971          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2972          inst->src[1] = payload;
2973
2974          invalidate_live_intervals();
2975       } else {
2976          /* Before register allocation, we didn't tell the scheduler about the
2977           * MRF we use.  We know it's safe to use this MRF because nothing
2978           * else does except for register spill/unspill, which generates and
2979           * uses its MRF within a single IR instruction.
2980           */
2981          inst->base_mrf = 14;
2982          inst->mlen = 1;
2983       }
2984    }
2985 }
2986
2987 void
2988 fs_visitor::dump_instructions()
2989 {
2990    calculate_register_pressure();
2991
2992    int ip = 0, max_pressure = 0;
2993    foreach_list(node, &this->instructions) {
2994       backend_instruction *inst = (backend_instruction *)node;
2995       max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2996       printf("{%3d} %4d: ", regs_live_at_ip[ip], ip);
2997       dump_instruction(inst);
2998       ++ip;
2999    }
3000    printf("Maximum %3d registers live at once.\n", max_pressure);
3001 }
3002
3003 void
3004 fs_visitor::dump_instruction(backend_instruction *be_inst)
3005 {
3006    fs_inst *inst = (fs_inst *)be_inst;
3007
3008    if (inst->predicate) {
3009       printf("(%cf0.%d) ",
3010              inst->predicate_inverse ? '-' : '+',
3011              inst->flag_subreg);
3012    }
3013
3014    printf("%s", brw_instruction_name(inst->opcode));
3015    if (inst->saturate)
3016       printf(".sat");
3017    if (inst->conditional_mod) {
3018       printf("%s", conditional_modifier[inst->conditional_mod]);
3019       if (!inst->predicate &&
3020           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3021                               inst->opcode != BRW_OPCODE_IF &&
3022                               inst->opcode != BRW_OPCODE_WHILE))) {
3023          printf(".f0.%d", inst->flag_subreg);
3024       }
3025    }
3026    printf(" ");
3027
3028
3029    switch (inst->dst.file) {
3030    case GRF:
3031       printf("vgrf%d", inst->dst.reg);
3032       if (virtual_grf_sizes[inst->dst.reg] != 1 ||
3033           inst->dst.subreg_offset)
3034          printf("+%d.%d", inst->dst.reg_offset, inst->dst.subreg_offset);
3035       break;
3036    case MRF:
3037       printf("m%d", inst->dst.reg);
3038       break;
3039    case BAD_FILE:
3040       printf("(null)");
3041       break;
3042    case UNIFORM:
3043       printf("***u%d***", inst->dst.reg);
3044       break;
3045    case HW_REG:
3046       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3047          switch (inst->dst.fixed_hw_reg.nr) {
3048          case BRW_ARF_NULL:
3049             printf("null");
3050             break;
3051          case BRW_ARF_ADDRESS:
3052             printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
3053             break;
3054          case BRW_ARF_ACCUMULATOR:
3055             printf("acc%d", inst->dst.fixed_hw_reg.subnr);
3056             break;
3057          case BRW_ARF_FLAG:
3058             printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3059                              inst->dst.fixed_hw_reg.subnr);
3060             break;
3061          default:
3062             printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3063                                inst->dst.fixed_hw_reg.subnr);
3064             break;
3065          }
3066       } else {
3067          printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3068       }
3069       if (inst->dst.fixed_hw_reg.subnr)
3070          printf("+%d", inst->dst.fixed_hw_reg.subnr);
3071       break;
3072    default:
3073       printf("???");
3074       break;
3075    }
3076    printf(":%s, ", reg_encoding[inst->dst.type]);
3077
3078    for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3079       if (inst->src[i].negate)
3080          printf("-");
3081       if (inst->src[i].abs)
3082          printf("|");
3083       switch (inst->src[i].file) {
3084       case GRF:
3085          printf("vgrf%d", inst->src[i].reg);
3086          if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3087              inst->src[i].subreg_offset)
3088             printf("+%d.%d", inst->src[i].reg_offset,
3089                    inst->src[i].subreg_offset);
3090          break;
3091       case MRF:
3092          printf("***m%d***", inst->src[i].reg);
3093          break;
3094       case UNIFORM:
3095          printf("u%d", inst->src[i].reg);
3096          if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3097              inst->src[i].subreg_offset)
3098             printf("+%d.%d", inst->src[i].reg_offset,
3099                    inst->src[i].subreg_offset);
3100          break;
3101       case BAD_FILE:
3102          printf("(null)");
3103          break;
3104       case IMM:
3105          switch (inst->src[i].type) {
3106          case BRW_REGISTER_TYPE_F:
3107             printf("%ff", inst->src[i].imm.f);
3108             break;
3109          case BRW_REGISTER_TYPE_D:
3110             printf("%dd", inst->src[i].imm.i);
3111             break;
3112          case BRW_REGISTER_TYPE_UD:
3113             printf("%uu", inst->src[i].imm.u);
3114             break;
3115          default:
3116             printf("???");
3117             break;
3118          }
3119          break;
3120       case HW_REG:
3121          if (inst->src[i].fixed_hw_reg.negate)
3122             printf("-");
3123          if (inst->src[i].fixed_hw_reg.abs)
3124             printf("|");
3125          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3126             switch (inst->src[i].fixed_hw_reg.nr) {
3127             case BRW_ARF_NULL:
3128                printf("null");
3129                break;
3130             case BRW_ARF_ADDRESS:
3131                printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
3132                break;
3133             case BRW_ARF_ACCUMULATOR:
3134                printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
3135                break;
3136             case BRW_ARF_FLAG:
3137                printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3138                                 inst->src[i].fixed_hw_reg.subnr);
3139                break;
3140             default:
3141                printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3142                                   inst->src[i].fixed_hw_reg.subnr);
3143                break;
3144             }
3145          } else {
3146             printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3147          }
3148          if (inst->src[i].fixed_hw_reg.subnr)
3149             printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3150          if (inst->src[i].fixed_hw_reg.abs)
3151             printf("|");
3152          break;
3153       default:
3154          printf("???");
3155          break;
3156       }
3157       if (inst->src[i].abs)
3158          printf("|");
3159
3160       if (inst->src[i].file != IMM) {
3161          printf(":%s", brw_reg_type_letters(inst->src[i].type));
3162       }
3163
3164       if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3165          printf(", ");
3166    }
3167
3168    printf(" ");
3169
3170    if (inst->force_uncompressed)
3171       printf("1sthalf ");
3172
3173    if (inst->force_sechalf)
3174       printf("2ndhalf ");
3175
3176    printf("\n");
3177 }
3178
3179 /**
3180  * Possibly returns an instruction that set up @param reg.
3181  *
3182  * Sometimes we want to take the result of some expression/variable
3183  * dereference tree and rewrite the instruction generating the result
3184  * of the tree.  When processing the tree, we know that the
3185  * instructions generated are all writing temporaries that are dead
3186  * outside of this tree.  So, if we have some instructions that write
3187  * a temporary, we're free to point that temp write somewhere else.
3188  *
3189  * Note that this doesn't guarantee that the instruction generated
3190  * only reg -- it might be the size=4 destination of a texture instruction.
3191  */
3192 fs_inst *
3193 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3194                                            fs_inst *end,
3195                                            fs_reg reg)
3196 {
3197    if (end == start ||
3198        end->is_partial_write() ||
3199        reg.reladdr ||
3200        !reg.equals(end->dst)) {
3201       return NULL;
3202    } else {
3203       return end;
3204    }
3205 }
3206
3207 void
3208 fs_visitor::setup_payload_gen6()
3209 {
3210    bool uses_depth =
3211       (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3212    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3213
3214    assert(brw->gen >= 6);
3215
3216    /* R0-1: masks, pixel X/Y coordinates. */
3217    c->nr_payload_regs = 2;
3218    /* R2: only for 32-pixel dispatch.*/
3219
3220    /* R3-26: barycentric interpolation coordinates.  These appear in the
3221     * same order that they appear in the brw_wm_barycentric_interp_mode
3222     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3223     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3224     * appear if they were enabled using the "Barycentric Interpolation
3225     * Mode" bits in WM_STATE.
3226     */
3227    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3228       if (barycentric_interp_modes & (1 << i)) {
3229          c->barycentric_coord_reg[i] = c->nr_payload_regs;
3230          c->nr_payload_regs += 2;
3231          if (dispatch_width == 16) {
3232             c->nr_payload_regs += 2;
3233          }
3234       }
3235    }
3236
3237    /* R27: interpolated depth if uses source depth */
3238    if (uses_depth) {
3239       c->source_depth_reg = c->nr_payload_regs;
3240       c->nr_payload_regs++;
3241       if (dispatch_width == 16) {
3242          /* R28: interpolated depth if not SIMD8. */
3243          c->nr_payload_regs++;
3244       }
3245    }
3246    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3247    if (uses_depth) {
3248       c->source_w_reg = c->nr_payload_regs;
3249       c->nr_payload_regs++;
3250       if (dispatch_width == 16) {
3251          /* R30: interpolated W if not SIMD8. */
3252          c->nr_payload_regs++;
3253       }
3254    }
3255
3256    c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3257    /* R31: MSAA position offsets. */
3258    if (c->prog_data.uses_pos_offset) {
3259       c->sample_pos_reg = c->nr_payload_regs;
3260       c->nr_payload_regs++;
3261    }
3262
3263    /* R32: MSAA input coverage mask */
3264    if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3265       assert(brw->gen >= 7);
3266       c->sample_mask_reg = c->nr_payload_regs;
3267       c->nr_payload_regs++;
3268       if (dispatch_width == 16) {
3269          /* R33: input coverage mask if not SIMD8. */
3270          c->nr_payload_regs++;
3271       }
3272    }
3273
3274    /* R34-: bary for 32-pixel. */
3275    /* R58-59: interp W for 32-pixel. */
3276
3277    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3278       c->source_depth_to_render_target = true;
3279    }
3280 }
3281
3282 void
3283 fs_visitor::assign_binding_table_offsets()
3284 {
3285    uint32_t next_binding_table_offset = 0;
3286
3287    /* If there are no color regions, we still perform an FB write to a null
3288     * renderbuffer, which we place at surface index 0.
3289     */
3290    c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3291    next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3292
3293    assign_common_binding_table_offsets(next_binding_table_offset);
3294 }
3295
3296 void
3297 fs_visitor::calculate_register_pressure()
3298 {
3299    calculate_live_intervals();
3300
3301    int num_instructions = 0;
3302    foreach_list(node, &this->instructions) {
3303       ++num_instructions;
3304    }
3305
3306    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3307
3308    for (int reg = 0; reg < virtual_grf_count; reg++) {
3309       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3310          regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3311    }
3312 }
3313
3314 bool
3315 fs_visitor::run()
3316 {
3317    sanity_param_count = fp->Base.Parameters->NumParameters;
3318    uint32_t orig_nr_params = stage_prog_data->nr_params;
3319    bool allocated_without_spills;
3320
3321    assign_binding_table_offsets();
3322
3323    if (brw->gen >= 6)
3324       setup_payload_gen6();
3325    else
3326       setup_payload_gen4();
3327
3328    if (0) {
3329       emit_dummy_fs();
3330    } else {
3331       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3332          emit_shader_time_begin();
3333
3334       calculate_urb_setup();
3335       if (fp->Base.InputsRead > 0) {
3336          if (brw->gen < 6)
3337             emit_interpolation_setup_gen4();
3338          else
3339             emit_interpolation_setup_gen6();
3340       }
3341
3342       /* We handle discards by keeping track of the still-live pixels in f0.1.
3343        * Initialize it with the dispatched pixels.
3344        */
3345       if (fp->UsesKill || c->key.alpha_test_func) {
3346          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3347          discard_init->flag_subreg = 1;
3348       }
3349
3350       /* Generate FS IR for main().  (the visitor only descends into
3351        * functions called "main").
3352        */
3353       if (shader) {
3354          foreach_list(node, &*shader->base.ir) {
3355             ir_instruction *ir = (ir_instruction *)node;
3356             base_ir = ir;
3357             this->result = reg_undef;
3358             ir->accept(this);
3359          }
3360       } else {
3361          emit_fragment_program_code();
3362       }
3363       base_ir = NULL;
3364       if (failed)
3365          return false;
3366
3367       emit(FS_OPCODE_PLACEHOLDER_HALT);
3368
3369       if (c->key.alpha_test_func)
3370          emit_alpha_test();
3371
3372       emit_fb_writes();
3373
3374       split_virtual_grfs();
3375
3376       move_uniform_array_access_to_pull_constants();
3377       remove_dead_constants();
3378       setup_pull_constants();
3379
3380       bool progress;
3381       do {
3382          progress = false;
3383
3384          compact_virtual_grfs();
3385
3386          progress = remove_duplicate_mrf_writes() || progress;
3387
3388          progress = opt_algebraic() || progress;
3389          progress = opt_cse() || progress;
3390          progress = opt_copy_propagate() || progress;
3391          progress = opt_peephole_predicated_break() || progress;
3392          progress = dead_code_eliminate() || progress;
3393          progress = dead_code_eliminate_local() || progress;
3394          progress = opt_peephole_sel() || progress;
3395          progress = dead_control_flow_eliminate(this) || progress;
3396          progress = opt_saturate_propagation() || progress;
3397          progress = register_coalesce() || progress;
3398          progress = compute_to_mrf() || progress;
3399       } while (progress);
3400
3401       lower_uniform_pull_constant_loads();
3402
3403       assign_curb_setup();
3404       assign_urb_setup();
3405
3406       static enum instruction_scheduler_mode pre_modes[] = {
3407          SCHEDULE_PRE,
3408          SCHEDULE_PRE_NON_LIFO,
3409          SCHEDULE_PRE_LIFO,
3410       };
3411
3412       /* Try each scheduling heuristic to see if it can successfully register
3413        * allocate without spilling.  They should be ordered by decreasing
3414        * performance but increasing likelihood of allocating.
3415        */
3416       for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3417          schedule_instructions(pre_modes[i]);
3418
3419          if (0) {
3420             assign_regs_trivial();
3421             allocated_without_spills = true;
3422          } else {
3423             allocated_without_spills = assign_regs(false);
3424          }
3425          if (allocated_without_spills)
3426             break;
3427       }
3428
3429       if (!allocated_without_spills) {
3430          /* We assume that any spilling is worse than just dropping back to
3431           * SIMD8.  There's probably actually some intermediate point where
3432           * SIMD16 with a couple of spills is still better.
3433           */
3434          if (dispatch_width == 16) {
3435             fail("Failure to register allocate.  Reduce number of "
3436                  "live scalar values to avoid this.");
3437          }
3438
3439          /* Since we're out of heuristics, just go spill registers until we
3440           * get an allocation.
3441           */
3442          while (!assign_regs(true)) {
3443             if (failed)
3444                break;
3445          }
3446       }
3447    }
3448    assert(force_uncompressed_stack == 0);
3449
3450    /* This must come after all optimization and register allocation, since
3451     * it inserts dead code that happens to have side effects, and it does
3452     * so based on the actual physical registers in use.
3453     */
3454    insert_gen4_send_dependency_workarounds();
3455
3456    if (failed)
3457       return false;
3458
3459    if (!allocated_without_spills)
3460       schedule_instructions(SCHEDULE_POST);
3461
3462    if (dispatch_width == 8) {
3463       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3464    } else {
3465       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3466
3467       /* Make sure we didn't try to sneak in an extra uniform */
3468       assert(orig_nr_params == stage_prog_data->nr_params);
3469       (void) orig_nr_params;
3470    }
3471
3472    /* If any state parameters were appended, then ParameterValues could have
3473     * been realloced, in which case the driver uniform storage set up by
3474     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3475     * sure that didn't happen.
3476     */
3477    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3478
3479    return !failed;
3480 }
3481
3482 const unsigned *
3483 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3484                struct gl_fragment_program *fp,
3485                struct gl_shader_program *prog,
3486                unsigned *final_assembly_size)
3487 {
3488    bool start_busy = false;
3489    float start_time = 0;
3490
3491    if (unlikely(brw->perf_debug)) {
3492       start_busy = (brw->batch.last_bo &&
3493                     drm_intel_bo_busy(brw->batch.last_bo));
3494       start_time = get_time();
3495    }
3496
3497    struct brw_shader *shader = NULL;
3498    if (prog)
3499       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3500
3501    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3502       if (prog) {
3503          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3504          _mesa_print_ir(shader->base.ir, NULL);
3505          printf("\n\n");
3506       } else {
3507          printf("ARB_fragment_program %d ir for native fragment shader\n",
3508                 fp->Base.Id);
3509          _mesa_print_program(&fp->Base);
3510       }
3511    }
3512
3513    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3514     */
3515    fs_visitor v(brw, c, prog, fp, 8);
3516    if (!v.run()) {
3517       if (prog) {
3518          prog->LinkStatus = false;
3519          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3520       }
3521
3522       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3523                     v.fail_msg);
3524
3525       return NULL;
3526    }
3527
3528    exec_list *simd16_instructions = NULL;
3529    fs_visitor v2(brw, c, prog, fp, 16);
3530    if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3531       if (c->prog_data.base.nr_pull_params == 0) {
3532          /* Try a SIMD16 compile */
3533          v2.import_uniforms(&v);
3534          if (!v2.run()) {
3535             perf_debug("SIMD16 shader failed to compile, falling back to "
3536                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3537          } else {
3538             simd16_instructions = &v2.instructions;
3539          }
3540       } else {
3541          perf_debug("Skipping SIMD16 due to pull parameters.\n");
3542       }
3543    }
3544
3545    const unsigned *assembly = NULL;
3546    if (brw->gen >= 8) {
3547       gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3548       assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3549                                      final_assembly_size);
3550    } else {
3551       fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3552       assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3553                                      final_assembly_size);
3554    }
3555
3556    if (unlikely(brw->perf_debug) && shader) {
3557       if (shader->compiled_once)
3558          brw_wm_debug_recompile(brw, prog, &c->key);
3559       shader->compiled_once = true;
3560
3561       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3562          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3563                     (get_time() - start_time) * 1000);
3564       }
3565    }
3566
3567    return assembly;
3568 }
3569
3570 bool
3571 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3572 {
3573    struct brw_context *brw = brw_context(ctx);
3574    struct brw_wm_prog_key key;
3575
3576    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3577       return true;
3578
3579    struct gl_fragment_program *fp = (struct gl_fragment_program *)
3580       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3581    struct brw_fragment_program *bfp = brw_fragment_program(fp);
3582    bool program_uses_dfdy = fp->UsesDFdy;
3583
3584    memset(&key, 0, sizeof(key));
3585
3586    if (brw->gen < 6) {
3587       if (fp->UsesKill)
3588          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3589
3590       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3591          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3592
3593       /* Just assume depth testing. */
3594       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3595       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3596    }
3597
3598    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3599                                          BRW_FS_VARYING_INPUT_MASK) > 16)
3600       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3601
3602    key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3603
3604    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3605    for (unsigned i = 0; i < sampler_count; i++) {
3606       if (fp->Base.ShadowSamplers & (1 << i)) {
3607          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3608          key.tex.swizzles[i] =
3609             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3610       } else {
3611          /* Color sampler: assume no swizzling. */
3612          key.tex.swizzles[i] = SWIZZLE_XYZW;
3613       }
3614    }
3615
3616    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3617       key.drawable_height = ctx->DrawBuffer->Height;
3618    }
3619
3620    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3621          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3622          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3623
3624    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3625       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3626                           key.nr_color_regions > 1;
3627    }
3628
3629    /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE.  The
3630     * quality of the derivatives is likely to be determined by the driconf
3631     * option.
3632     */
3633    key.high_quality_derivatives = brw->disable_derivative_optimization;
3634
3635    key.program_string_id = bfp->id;
3636
3637    uint32_t old_prog_offset = brw->wm.base.prog_offset;
3638    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3639
3640    bool success = do_wm_prog(brw, prog, bfp, &key);
3641
3642    brw->wm.base.prog_offset = old_prog_offset;
3643    brw->wm.prog_data = old_prog_data;
3644
3645    return success;
3646 }