i965: Add FS backend for builtin gl_SampleID
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "main/uniforms.h"
50 #include "brw_fs_live_variables.h"
51 #include "glsl/glsl_types.h"
52
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
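/* For instance, the ALU2(ADD) line below expands to a small factory method:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * so visitor code can build and emit instructions as, e.g.,
 * emit(ADD(dst, src0, src1)).
 */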
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
176 ALU1(BFREV)
177 ALU3(BFE)
178 ALU2(BFI1)
179 ALU3(BFI2)
180 ALU1(FBH)
181 ALU1(FBL)
182 ALU1(CBIT)
183 ALU3(MAD)
184 ALU2(ADDC)
185 ALU2(SUBB)
186
187 /** Gen4 predicated IF. */
188 fs_inst *
189 fs_visitor::IF(uint32_t predicate)
190 {
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
192 inst->predicate = predicate;
193 return inst;
194 }
195
196 /** Gen6+ IF with embedded comparison. */
197 fs_inst *
198 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
199 {
200 assert(brw->gen >= 6);
201 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
202 reg_null_d, src0, src1);
203 inst->conditional_mod = condition;
204 return inst;
205 }
206
207 /**
208 * CMP: Sets the low bit of the destination channels with the result
209 * of the comparison, while the upper bits are undefined, and updates
210 * the flag register with the packed 16 bits of the result.
211 */
212 fs_inst *
213 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
214 {
215 fs_inst *inst;
216
217 /* Take the instruction:
218 *
219 * CMP null<d> src0<f> src1<f>
220 *
221 * Original gen4 does type conversion to the destination type before
222 * comparison, producing garbage results for floating point comparisons.
223 * gen5 does the comparison on the execution type (resolved source types),
224 * so dst type doesn't matter. gen6 does comparison and then uses the
225 * result as if it was the dst type with no conversion, which happens to
226 * mostly work out for float-interpreted-as-int since our comparisons are
227 * for >0, =0, <0.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 exec_list
245 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
246 fs_reg varying_offset,
247 uint32_t const_offset)
248 {
249 exec_list instructions;
250 fs_inst *inst;
251
252 /* We have our constant surface use a pitch of 4 bytes, so our index can
253 * be any component of a vector, and then we load 4 contiguous
254 * components starting from that.
255 *
256 * We break down the const_offset to a portion added to the variable
257 * offset and a portion done using reg_offset, which means that if you
258 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
259 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
260 * CSE can later notice that those loads are all the same and eliminate
261 * the redundant ones.
262 */
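/* For example, with const_offset == 6 the vec4-aligned part (6 & ~3 == 4)
 * is folded into vec4_offset here, while the remaining component
 * (6 & 3 == 2) is applied through reg_offset further down, so the final
 * MOV copies component 2 of the loaded vec4 into dst.
 */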
263 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
264 instructions.push_tail(ADD(vec4_offset,
265 varying_offset, const_offset & ~3));
266
267 int scale = 1;
268 if (brw->gen == 4 && dispatch_width == 8) {
269 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
270 * u, v, r) as parameters, or we can just use the SIMD16 message
271 * consisting of (header, u). We choose the second, at the cost of a
272 * longer return length.
273 */
274 scale = 2;
275 }
276
277 enum opcode op;
278 if (brw->gen >= 7)
279 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
280 else
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
282 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
283 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
284 inst->regs_written = 4 * scale;
285 instructions.push_tail(inst);
286
287 if (brw->gen < 7) {
288 inst->base_mrf = 13;
289 inst->header_present = true;
290 if (brw->gen == 4)
291 inst->mlen = 3;
292 else
293 inst->mlen = 1 + dispatch_width / 8;
294 }
295
296 vec4_result.reg_offset += (const_offset & 3) * scale;
297 instructions.push_tail(MOV(dst, vec4_result));
298
299 return instructions;
300 }
301
302 /**
303 * A helper for MOV generation for fixing up broken hardware SEND dependency
304 * handling.
305 */
306 fs_inst *
307 fs_visitor::DEP_RESOLVE_MOV(int grf)
308 {
309 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
310
311 inst->ir = NULL;
312 inst->annotation = "send dependency resolve";
313
314 /* The caller always wants uncompressed to emit the minimal extra
315 * dependencies, and to avoid having to deal with aligning its regs to 2.
316 */
317 inst->force_uncompressed = true;
318
319 return inst;
320 }
321
322 bool
323 fs_inst::equals(fs_inst *inst)
324 {
325 return (opcode == inst->opcode &&
326 dst.equals(inst->dst) &&
327 src[0].equals(inst->src[0]) &&
328 src[1].equals(inst->src[1]) &&
329 src[2].equals(inst->src[2]) &&
330 saturate == inst->saturate &&
331 predicate == inst->predicate &&
332 conditional_mod == inst->conditional_mod &&
333 mlen == inst->mlen &&
334 base_mrf == inst->base_mrf &&
335 sampler == inst->sampler &&
336 target == inst->target &&
337 eot == inst->eot &&
338 header_present == inst->header_present &&
339 shadow_compare == inst->shadow_compare &&
340 offset == inst->offset);
341 }
342
343 bool
344 fs_inst::overwrites_reg(const fs_reg &reg)
345 {
346 return (reg.file == dst.file &&
347 reg.reg == dst.reg &&
348 reg.reg_offset >= dst.reg_offset &&
349 reg.reg_offset < dst.reg_offset + regs_written);
350 }
351
352 bool
353 fs_inst::is_send_from_grf()
354 {
355 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
356 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
357 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
358 src[1].file == GRF) ||
359 (is_tex() && src[0].file == GRF));
360 }
361
362 bool
363 fs_visitor::can_do_source_mods(fs_inst *inst)
364 {
365 if (brw->gen == 6 && inst->is_math())
366 return false;
367
368 if (inst->is_send_from_grf())
369 return false;
370
371 if (!inst->can_do_source_mods())
372 return false;
373
374 return true;
375 }
376
377 void
378 fs_reg::init()
379 {
380 memset(this, 0, sizeof(*this));
381 this->smear = -1;
382 }
383
384 /** Generic unset register constructor. */
385 fs_reg::fs_reg()
386 {
387 init();
388 this->file = BAD_FILE;
389 }
390
391 /** Immediate value constructor. */
392 fs_reg::fs_reg(float f)
393 {
394 init();
395 this->file = IMM;
396 this->type = BRW_REGISTER_TYPE_F;
397 this->imm.f = f;
398 }
399
400 /** Immediate value constructor. */
401 fs_reg::fs_reg(int32_t i)
402 {
403 init();
404 this->file = IMM;
405 this->type = BRW_REGISTER_TYPE_D;
406 this->imm.i = i;
407 }
408
409 /** Immediate value constructor. */
410 fs_reg::fs_reg(uint32_t u)
411 {
412 init();
413 this->file = IMM;
414 this->type = BRW_REGISTER_TYPE_UD;
415 this->imm.u = u;
416 }
417
418 /** Fixed brw_reg Immediate value constructor. */
419 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
420 {
421 init();
422 this->file = HW_REG;
423 this->fixed_hw_reg = fixed_hw_reg;
424 this->type = fixed_hw_reg.type;
425 }
426
427 bool
428 fs_reg::equals(const fs_reg &r) const
429 {
430 return (file == r.file &&
431 reg == r.reg &&
432 reg_offset == r.reg_offset &&
433 type == r.type &&
434 negate == r.negate &&
435 abs == r.abs &&
436 !reladdr && !r.reladdr &&
437 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
438 sizeof(fixed_hw_reg)) == 0 &&
439 smear == r.smear &&
440 imm.u == r.imm.u);
441 }
442
443 fs_reg
444 fs_reg::retype(uint32_t type)
445 {
446 fs_reg result = *this;
447 result.type = type;
448 return result;
449 }
450
451 bool
452 fs_reg::is_zero() const
453 {
454 if (file != IMM)
455 return false;
456
457 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
458 }
459
460 bool
461 fs_reg::is_one() const
462 {
463 if (file != IMM)
464 return false;
465
466 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
467 }
468
469 bool
470 fs_reg::is_null() const
471 {
472 return file == HW_REG &&
473 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
474 fixed_hw_reg.nr == BRW_ARF_NULL;
475 }
476
477 bool
478 fs_reg::is_valid_3src() const
479 {
480 return file == GRF || file == UNIFORM;
481 }
482
483 int
484 fs_visitor::type_size(const struct glsl_type *type)
485 {
486 unsigned int size, i;
487
488 switch (type->base_type) {
489 case GLSL_TYPE_UINT:
490 case GLSL_TYPE_INT:
491 case GLSL_TYPE_FLOAT:
492 case GLSL_TYPE_BOOL:
493 return type->components();
494 case GLSL_TYPE_ARRAY:
495 return type_size(type->fields.array) * type->length;
496 case GLSL_TYPE_STRUCT:
497 size = 0;
498 for (i = 0; i < type->length; i++) {
499 size += type_size(type->fields.structure[i].type);
500 }
501 return size;
502 case GLSL_TYPE_SAMPLER:
503 /* Samplers take up no register space, since they're baked in at
504 * link time.
505 */
506 return 0;
507 case GLSL_TYPE_ATOMIC_UINT:
508 return 0;
509 case GLSL_TYPE_VOID:
510 case GLSL_TYPE_ERROR:
511 case GLSL_TYPE_INTERFACE:
512 assert(!"not reached");
513 break;
514 }
515
516 return 0;
517 }
518
519 fs_reg
520 fs_visitor::get_timestamp()
521 {
522 assert(brw->gen >= 7);
523
524 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
525 BRW_ARF_TIMESTAMP,
526 0),
527 BRW_REGISTER_TYPE_UD));
528
529 fs_reg dst = fs_reg(this, glsl_type::uint_type);
530
531 fs_inst *mov = emit(MOV(dst, ts));
532 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
533 * even if it's not enabled in the dispatch.
534 */
535 mov->force_writemask_all = true;
536 mov->force_uncompressed = true;
537
538 /* The caller wants the low 32 bits of the timestamp. Since it's running
539 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
540 * which is plenty of time for our purposes. It is identical across the
541 * EUs, but since it's tracking GPU core speed it will increment at a
542 * varying rate as render P-states change.
543 *
544 * The caller could also check if render P-states have changed (or anything
545 * else that might disrupt timing) by setting smear to 2 and checking if
546 * that field is != 0.
547 */
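/* Rough arithmetic behind the "~3 seconds" above: the counter is 32 bits,
 * so at that clock rate it wraps after about 2^32 / 1.2e9 ≈ 3.6 seconds.
 */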
548 dst.smear = 0;
549
550 return dst;
551 }
552
553 void
554 fs_visitor::emit_shader_time_begin()
555 {
556 current_annotation = "shader time start";
557 shader_start_time = get_timestamp();
558 }
559
560 void
561 fs_visitor::emit_shader_time_end()
562 {
563 current_annotation = "shader time end";
564
565 enum shader_time_shader_type type, written_type, reset_type;
566 if (dispatch_width == 8) {
567 type = ST_FS8;
568 written_type = ST_FS8_WRITTEN;
569 reset_type = ST_FS8_RESET;
570 } else {
571 assert(dispatch_width == 16);
572 type = ST_FS16;
573 written_type = ST_FS16_WRITTEN;
574 reset_type = ST_FS16_RESET;
575 }
576
577 fs_reg shader_end_time = get_timestamp();
578
579 /* Check that there weren't any timestamp reset events (assuming these
580 * were the only two timestamp reads that happened).
581 */
582 fs_reg reset = shader_end_time;
583 reset.smear = 2;
584 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
585 test->conditional_mod = BRW_CONDITIONAL_Z;
586 emit(IF(BRW_PREDICATE_NORMAL));
587
588 push_force_uncompressed();
589 fs_reg start = shader_start_time;
590 start.negate = true;
591 fs_reg diff = fs_reg(this, glsl_type::uint_type);
592 emit(ADD(diff, start, shader_end_time));
593
594 /* If there were no instructions between the two timestamp gets, the diff
595 * is 2 cycles. Remove that overhead, so I can forget about that when
596 * trying to determine the time taken for single instructions.
597 */
598 emit(ADD(diff, diff, fs_reg(-2u)));
599
600 emit_shader_time_write(type, diff);
601 emit_shader_time_write(written_type, fs_reg(1u));
602 emit(BRW_OPCODE_ELSE);
603 emit_shader_time_write(reset_type, fs_reg(1u));
604 emit(BRW_OPCODE_ENDIF);
605
606 pop_force_uncompressed();
607 }
608
609 void
610 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
611 fs_reg value)
612 {
613 int shader_time_index =
614 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
615 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
616
617 fs_reg payload;
618 if (dispatch_width == 8)
619 payload = fs_reg(this, glsl_type::uvec2_type);
620 else
621 payload = fs_reg(this, glsl_type::uint_type);
622
623 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
624 fs_reg(), payload, offset, value));
625 }
626
627 void
628 fs_visitor::fail(const char *format, ...)
629 {
630 va_list va;
631 char *msg;
632
633 if (failed)
634 return;
635
636 failed = true;
637
638 va_start(va, format);
639 msg = ralloc_vasprintf(mem_ctx, format, va);
640 va_end(va);
641 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
642
643 this->fail_msg = msg;
644
645 if (INTEL_DEBUG & DEBUG_WM) {
646 fprintf(stderr, "%s", msg);
647 }
648 }
649
650 fs_inst *
651 fs_visitor::emit(enum opcode opcode)
652 {
653 return emit(fs_inst(opcode));
654 }
655
656 fs_inst *
657 fs_visitor::emit(enum opcode opcode, fs_reg dst)
658 {
659 return emit(fs_inst(opcode, dst));
660 }
661
662 fs_inst *
663 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
664 {
665 return emit(fs_inst(opcode, dst, src0));
666 }
667
668 fs_inst *
669 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
670 {
671 return emit(fs_inst(opcode, dst, src0, src1));
672 }
673
674 fs_inst *
675 fs_visitor::emit(enum opcode opcode, fs_reg dst,
676 fs_reg src0, fs_reg src1, fs_reg src2)
677 {
678 return emit(fs_inst(opcode, dst, src0, src1, src2));
679 }
680
681 void
682 fs_visitor::push_force_uncompressed()
683 {
684 force_uncompressed_stack++;
685 }
686
687 void
688 fs_visitor::pop_force_uncompressed()
689 {
690 force_uncompressed_stack--;
691 assert(force_uncompressed_stack >= 0);
692 }
693
694 void
695 fs_visitor::push_force_sechalf()
696 {
697 force_sechalf_stack++;
698 }
699
700 void
701 fs_visitor::pop_force_sechalf()
702 {
703 force_sechalf_stack--;
704 assert(force_sechalf_stack >= 0);
705 }
706
707 /**
708 * Returns true if the instruction has a flag that means it won't
709 * update an entire destination register.
710 *
711 * For example, dead code elimination and live variable analysis want to know
712 * when a write to a variable screens off any preceding values that were in
713 * it.
714 */
715 bool
716 fs_inst::is_partial_write()
717 {
718 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
719 this->force_uncompressed ||
720 this->force_sechalf);
721 }
722
723 int
724 fs_inst::regs_read(fs_visitor *v, int arg)
725 {
726 if (is_tex() && arg == 0 && src[0].file == GRF) {
727 if (v->dispatch_width == 16)
728 return (mlen + 1) / 2;
729 else
730 return mlen;
731 }
732 return 1;
733 }
734
735 bool
736 fs_inst::reads_flag()
737 {
738 return predicate;
739 }
740
741 bool
742 fs_inst::writes_flag()
743 {
744 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
745 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
746 }
747
748 /**
749 * Returns how many MRFs an FS opcode will write over.
750 *
751 * Note that this is not the 0 or 1 implied writes in an actual gen
752 * instruction -- the FS opcodes often generate MOVs in addition.
753 */
754 int
755 fs_visitor::implied_mrf_writes(fs_inst *inst)
756 {
757 if (inst->mlen == 0)
758 return 0;
759
760 if (inst->base_mrf == -1)
761 return 0;
762
763 switch (inst->opcode) {
764 case SHADER_OPCODE_RCP:
765 case SHADER_OPCODE_RSQ:
766 case SHADER_OPCODE_SQRT:
767 case SHADER_OPCODE_EXP2:
768 case SHADER_OPCODE_LOG2:
769 case SHADER_OPCODE_SIN:
770 case SHADER_OPCODE_COS:
771 return 1 * dispatch_width / 8;
772 case SHADER_OPCODE_POW:
773 case SHADER_OPCODE_INT_QUOTIENT:
774 case SHADER_OPCODE_INT_REMAINDER:
775 return 2 * dispatch_width / 8;
776 case SHADER_OPCODE_TEX:
777 case FS_OPCODE_TXB:
778 case SHADER_OPCODE_TXD:
779 case SHADER_OPCODE_TXF:
780 case SHADER_OPCODE_TXF_MS:
781 case SHADER_OPCODE_TG4:
782 case SHADER_OPCODE_TG4_OFFSET:
783 case SHADER_OPCODE_TXL:
784 case SHADER_OPCODE_TXS:
785 case SHADER_OPCODE_LOD:
786 return 1;
787 case FS_OPCODE_FB_WRITE:
788 return 2;
789 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
790 case SHADER_OPCODE_GEN4_SCRATCH_READ:
791 return 1;
792 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
793 return inst->mlen;
794 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
795 return 2;
796 case SHADER_OPCODE_UNTYPED_ATOMIC:
797 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
798 return 0;
799 default:
800 assert(!"not reached");
801 return inst->mlen;
802 }
803 }
804
805 int
806 fs_visitor::virtual_grf_alloc(int size)
807 {
808 if (virtual_grf_array_size <= virtual_grf_count) {
809 if (virtual_grf_array_size == 0)
810 virtual_grf_array_size = 16;
811 else
812 virtual_grf_array_size *= 2;
813 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
814 virtual_grf_array_size);
815 }
816 virtual_grf_sizes[virtual_grf_count] = size;
817 return virtual_grf_count++;
818 }
819
820 /** Fixed HW reg constructor. */
821 fs_reg::fs_reg(enum register_file file, int reg)
822 {
823 init();
824 this->file = file;
825 this->reg = reg;
826 this->type = BRW_REGISTER_TYPE_F;
827 }
828
829 /** Fixed HW reg constructor. */
830 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
831 {
832 init();
833 this->file = file;
834 this->reg = reg;
835 this->type = type;
836 }
837
838 /** Automatic reg constructor. */
839 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
840 {
841 init();
842
843 this->file = GRF;
844 this->reg = v->virtual_grf_alloc(v->type_size(type));
845 this->reg_offset = 0;
846 this->type = brw_type_for_base_type(type);
847 }
848
849 fs_reg *
850 fs_visitor::variable_storage(ir_variable *var)
851 {
852 return (fs_reg *)hash_table_find(this->variable_ht, var);
853 }
854
855 void
856 import_uniforms_callback(const void *key,
857 void *data,
858 void *closure)
859 {
860 struct hash_table *dst_ht = (struct hash_table *)closure;
861 const fs_reg *reg = (const fs_reg *)data;
862
863 if (reg->file != UNIFORM)
864 return;
865
866 hash_table_insert(dst_ht, data, key);
867 }
868
869 /* For 16-wide, we need to follow the uniform setup from the 8-wide dispatch.
870 * This brings in those uniform definitions.
871 */
872 void
873 fs_visitor::import_uniforms(fs_visitor *v)
874 {
875 hash_table_call_foreach(v->variable_ht,
876 import_uniforms_callback,
877 variable_ht);
878 this->params_remap = v->params_remap;
879 this->nr_params_remap = v->nr_params_remap;
880 }
881
882 /* Our support for uniforms is piggy-backed on the struct
883 * gl_fragment_program, because that's where the values actually
884 * get stored, rather than in some global gl_shader_program uniform
885 * store.
886 */
887 void
888 fs_visitor::setup_uniform_values(ir_variable *ir)
889 {
890 int namelen = strlen(ir->name);
891
892 /* The data for our (non-builtin) uniforms is stored in a series of
893 * gl_uniform_driver_storage structs for each subcomponent that
894 * glGetUniformLocation() could name. We know it's been set up in the same
895 * order we'd walk the type, so walk the list of storage and find anything
896 * with our name, or the prefix of a component that starts with our name.
897 */
898 unsigned params_before = c->prog_data.nr_params;
899 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
900 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
901
902 if (strncmp(ir->name, storage->name, namelen) != 0 ||
903 (storage->name[namelen] != 0 &&
904 storage->name[namelen] != '.' &&
905 storage->name[namelen] != '[')) {
906 continue;
907 }
908
909 unsigned slots = storage->type->component_slots();
910 if (storage->array_elements)
911 slots *= storage->array_elements;
912
913 for (unsigned i = 0; i < slots; i++) {
914 c->prog_data.param[c->prog_data.nr_params++] =
915 &storage->storage[i].f;
916 }
917 }
918
919 /* Make sure we actually initialized the right amount of stuff here. */
920 assert(params_before + ir->type->component_slots() ==
921 c->prog_data.nr_params);
922 (void)params_before;
923 }
924
925
926 /* Our support for builtin uniforms is even scarier than non-builtin.
927 * It sits on top of the PROG_STATE_VAR parameters that are
928 * automatically updated from GL context state.
929 */
930 void
931 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
932 {
933 const ir_state_slot *const slots = ir->state_slots;
934 assert(ir->state_slots != NULL);
935
936 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
937 /* This state reference has already been set up by ir_to_mesa, but we'll
938 * get the same index back here.
939 */
940 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
941 (gl_state_index *)slots[i].tokens);
942
943 /* Add each of the unique swizzles of the element as a parameter.
944 * This'll end up matching the expected layout of the
945 * array/matrix/structure we're trying to fill in.
946 */
947 int last_swiz = -1;
948 for (unsigned int j = 0; j < 4; j++) {
949 int swiz = GET_SWZ(slots[i].swizzle, j);
950 if (swiz == last_swiz)
951 break;
952 last_swiz = swiz;
953
954 c->prog_data.param[c->prog_data.nr_params++] =
955 &fp->Base.Parameters->ParameterValues[index][swiz].f;
956 }
957 }
958 }
959
960 fs_reg *
961 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
962 {
963 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
964 fs_reg wpos = *reg;
965 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
966
967 /* gl_FragCoord.x */
968 if (ir->pixel_center_integer) {
969 emit(MOV(wpos, this->pixel_x));
970 } else {
971 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
972 }
973 wpos.reg_offset++;
974
975 /* gl_FragCoord.y */
976 if (!flip && ir->pixel_center_integer) {
977 emit(MOV(wpos, this->pixel_y));
978 } else {
979 fs_reg pixel_y = this->pixel_y;
980 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
981
982 if (flip) {
983 pixel_y.negate = true;
984 offset += c->key.drawable_height - 1.0;
985 }
986
987 emit(ADD(wpos, pixel_y, fs_reg(offset)));
988 }
989 wpos.reg_offset++;
990
991 /* gl_FragCoord.z */
992 if (brw->gen >= 6) {
993 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
994 } else {
995 emit(FS_OPCODE_LINTERP, wpos,
996 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
997 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
998 interp_reg(VARYING_SLOT_POS, 2));
999 }
1000 wpos.reg_offset++;
1001
1002 /* gl_FragCoord.w: Already set up in emit_interpolation */
1003 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1004
1005 return reg;
1006 }
1007
1008 fs_inst *
1009 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1010 glsl_interp_qualifier interpolation_mode,
1011 bool is_centroid)
1012 {
1013 brw_wm_barycentric_interp_mode barycoord_mode;
1014 if (brw->gen >= 6) {
1015 if (is_centroid) {
1016 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1017 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1018 else
1019 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1020 } else {
1021 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1022 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1023 else
1024 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1025 }
1026 } else {
1027 /* On Ironlake and below, there is only one interpolation mode.
1028 * Centroid interpolation doesn't mean anything on this hardware --
1029 * there is no multisampling.
1030 */
1031 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1032 }
1033 return emit(FS_OPCODE_LINTERP, attr,
1034 this->delta_x[barycoord_mode],
1035 this->delta_y[barycoord_mode], interp);
1036 }
1037
1038 fs_reg *
1039 fs_visitor::emit_general_interpolation(ir_variable *ir)
1040 {
1041 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1042 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1043 fs_reg attr = *reg;
1044
1045 unsigned int array_elements;
1046 const glsl_type *type;
1047
1048 if (ir->type->is_array()) {
1049 array_elements = ir->type->length;
1050 if (array_elements == 0) {
1051 fail("dereferenced array '%s' has length 0\n", ir->name);
1052 }
1053 type = ir->type->fields.array;
1054 } else {
1055 array_elements = 1;
1056 type = ir->type;
1057 }
1058
1059 glsl_interp_qualifier interpolation_mode =
1060 ir->determine_interpolation_mode(c->key.flat_shade);
1061
1062 int location = ir->location;
1063 for (unsigned int i = 0; i < array_elements; i++) {
1064 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1065 if (c->prog_data.urb_setup[location] == -1) {
1066 /* If there's no incoming setup data for this slot, don't
1067 * emit interpolation for it.
1068 */
1069 attr.reg_offset += type->vector_elements;
1070 location++;
1071 continue;
1072 }
1073
1074 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1075 /* Constant interpolation (flat shading) case. The SF has
1076 * handed us defined values in only the constant offset
1077 * field of the setup reg.
1078 */
1079 for (unsigned int k = 0; k < type->vector_elements; k++) {
1080 struct brw_reg interp = interp_reg(location, k);
1081 interp = suboffset(interp, 3);
1082 interp.type = reg->type;
1083 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1084 attr.reg_offset++;
1085 }
1086 } else {
1087 /* Smooth/noperspective interpolation case. */
1088 for (unsigned int k = 0; k < type->vector_elements; k++) {
1089 /* FINISHME: At some point we probably want to push
1090 * this farther by giving similar treatment to the
1091 * other potentially constant components of the
1092 * attribute, as well as making brw_vs_constval.c
1093 * handle varyings other than gl_TexCoord.
1094 */
1095 struct brw_reg interp = interp_reg(location, k);
1096 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1097 ir->centroid);
1098 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1099 /* Get the pixel/sample mask into f0 so that we know
1100 * which pixels are lit. Then, for each channel that is
1101 * unlit, replace the centroid data with non-centroid
1102 * data.
1103 */
1104 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1105 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1106 interpolation_mode, false);
1107 inst->predicate = BRW_PREDICATE_NORMAL;
1108 inst->predicate_inverse = true;
1109 }
1110 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1111 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1112 }
1113 attr.reg_offset++;
1114 }
1115
1116 }
1117 location++;
1118 }
1119 }
1120
1121 return reg;
1122 }
1123
1124 fs_reg *
1125 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1126 {
1127 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1128
1129 /* The frontfacing comes in as a bit in the thread payload. */
1130 if (brw->gen >= 6) {
1131 emit(BRW_OPCODE_ASR, *reg,
1132 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1133 fs_reg(15));
1134 emit(BRW_OPCODE_NOT, *reg, *reg);
1135 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1136 } else {
1137 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1138 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1139 * us front face
1140 */
1141 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1142 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1143 }
1144
1145 return reg;
1146 }
1147
1148 void
1149 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1150 {
1151 assert(dst.type == BRW_REGISTER_TYPE_F);
1152
1153 if (c->key.compute_pos_offset) {
1154 /* Convert int_sample_pos to floating point */
1155 emit(MOV(dst, int_sample_pos));
1156 /* Scale to the range [0, 1] */
1157 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1158 }
1159 else {
1160 /* From ARB_sample_shading specification:
1161 * "When rendering to a non-multisample buffer, or if multisample
1162 * rasterization is disabled, gl_SamplePosition will always be
1163 * (0.5, 0.5)."
1164 */
1165 emit(MOV(dst, fs_reg(0.5f)));
1166 }
1167 }
1168
1169 fs_reg *
1170 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1171 {
1172 assert(brw->gen >= 6);
1173 assert(ir->type == glsl_type::vec2_type);
1174
1175 this->current_annotation = "compute sample position";
1176 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1177 fs_reg pos = *reg;
1178 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1179 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1180
1181 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1182 * mode will be enabled.
1183 *
1184 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1185 * R31.1:0 Position Offset X/Y for Slot[3:0]
1186 * R31.3:2 Position Offset X/Y for Slot[7:4]
1187 * .....
1188 *
1189 * The X, Y sample positions come in as bytes in the thread payload. So, read
1190 * the positions using vstride=16, width=8, hstride=2.
1191 */
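/* Given the payload layout quoted above, the bytes of each DWord pair go
 * X0 Y0 X1 Y1 ..., so even bytes hold the X offsets and odd bytes the Y
 * offsets. The (vstride, width, hstride) = (16, 8, 2) byte region below
 * therefore reads bytes 0, 2, ..., 14 (the eight X values of a SIMD8
 * dispatch); suboffset(..., 1) picks up the Y values and
 * suboffset(..., 16) reaches the second half of a SIMD16 dispatch.
 */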
1192 struct brw_reg sample_pos_reg =
1193 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1194 BRW_REGISTER_TYPE_B), 16, 8, 2);
1195
1196 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1197 if (dispatch_width == 16) {
1198 int_sample_x.sechalf = true;
1199 fs_inst *inst = emit(MOV(int_sample_x,
1200 fs_reg(suboffset(sample_pos_reg, 16))));
1201 inst->force_sechalf = true;
1202 int_sample_x.sechalf = false;
1203 }
1204 /* Compute gl_SamplePosition.x */
1205 compute_sample_position(pos, int_sample_x);
1206 pos.reg_offset++;
1207 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1208 if (dispatch_width == 16) {
1209 int_sample_y.sechalf = true;
1210 fs_inst *inst = emit(MOV(int_sample_y,
1211 fs_reg(suboffset(sample_pos_reg, 17))));
1212 inst->force_sechalf = true;
1213 int_sample_y.sechalf = false;
1214 }
1215 /* Compute gl_SamplePosition.y */
1216 compute_sample_position(pos, int_sample_y);
1217 return reg;
1218 }
1219
1220 fs_reg *
1221 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1222 {
1223 assert(brw->gen >= 6);
1224
1225 this->current_annotation = "compute sample id";
1226 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1227
1228 if (c->key.compute_sample_id) {
1229 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1230 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1231 t2.type = BRW_REGISTER_TYPE_UW;
1232
1233 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1234 * 8x multisampling, subspan 0 will represent sample N (where N
1235 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1236 * 7. We can find the value of N by looking at R0.0 bits 7:6
1237 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1238 * (since samples are always delivered in pairs). That is, we
1239 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1240 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1241 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1242 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1243 * populating a temporary variable with the sequence (0, 1, 2, 3),
1244 * and then reading from it using vstride=1, width=4, hstride=0.
1245 * These computations hold good for 4x multisampling as well.
1246 */
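/* Worked example: if SSPI (R0.0 bits 7:6) reads 1, then
 * (R0.0 & 0xc0) >> 5 == 0x40 >> 5 == 2, i.e. N == 2, and after adding the
 * (0, 0, 0, 0, 1, 1, 1, 1) sequence held in t2 the SIMD8 channels end up
 * with gl_SampleID == 2, 2, 2, 2, 3, 3, 3, 3.
 */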
1247 emit(BRW_OPCODE_AND, t1,
1248 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1249 fs_reg(brw_imm_d(0xc0)));
1250 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1251 /* This works for both SIMD8 and SIMD16 */
1252 emit(MOV(t2, brw_imm_v(0x3210)));
1253 /* This special instruction takes care of setting vstride=1,
1254 * width=4, hstride=0 of t2 during an ADD instruction.
1255 */
1256 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1257 } else {
1258 /* As per GL_ARB_sample_shading specification:
1259 * "When rendering to a non-multisample buffer, or if multisample
1260 * rasterization is disabled, gl_SampleID will always be zero."
1261 */
1262 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1263 }
1264
1265 return reg;
1266 }
1267
1268 fs_reg
1269 fs_visitor::fix_math_operand(fs_reg src)
1270 {
1271 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1272 * might be able to do better by doing execsize = 1 math and then
1273 * expanding that result out, but we would need to be careful with
1274 * masking.
1275 *
1276 * The hardware ignores source modifiers (negate and abs) on math
1277 * instructions, so we also move to a temp to set those up.
1278 */
1279 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1280 !src.abs && !src.negate)
1281 return src;
1282
1283 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1284 * operands to math
1285 */
1286 if (brw->gen >= 7 && src.file != IMM)
1287 return src;
1288
1289 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1290 expanded.type = src.type;
1291 emit(BRW_OPCODE_MOV, expanded, src);
1292 return expanded;
1293 }
1294
1295 fs_inst *
1296 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1297 {
1298 switch (opcode) {
1299 case SHADER_OPCODE_RCP:
1300 case SHADER_OPCODE_RSQ:
1301 case SHADER_OPCODE_SQRT:
1302 case SHADER_OPCODE_EXP2:
1303 case SHADER_OPCODE_LOG2:
1304 case SHADER_OPCODE_SIN:
1305 case SHADER_OPCODE_COS:
1306 break;
1307 default:
1308 assert(!"not reached: bad math opcode");
1309 return NULL;
1310 }
1311
1312 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1313 * might be able to do better by doing execsize = 1 math and then
1314 * expanding that result out, but we would need to be careful with
1315 * masking.
1316 *
1317 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1318 * instructions, so we also move to a temp to set those up.
1319 */
1320 if (brw->gen >= 6)
1321 src = fix_math_operand(src);
1322
1323 fs_inst *inst = emit(opcode, dst, src);
1324
1325 if (brw->gen < 6) {
1326 inst->base_mrf = 2;
1327 inst->mlen = dispatch_width / 8;
1328 }
1329
1330 return inst;
1331 }
1332
1333 fs_inst *
1334 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1335 {
1336 int base_mrf = 2;
1337 fs_inst *inst;
1338
1339 switch (opcode) {
1340 case SHADER_OPCODE_INT_QUOTIENT:
1341 case SHADER_OPCODE_INT_REMAINDER:
1342 if (brw->gen >= 7 && dispatch_width == 16)
1343 fail("16-wide INTDIV unsupported\n");
1344 break;
1345 case SHADER_OPCODE_POW:
1346 break;
1347 default:
1348 assert(!"not reached: unsupported binary math opcode.");
1349 return NULL;
1350 }
1351
1352 if (brw->gen >= 6) {
1353 src0 = fix_math_operand(src0);
1354 src1 = fix_math_operand(src1);
1355
1356 inst = emit(opcode, dst, src0, src1);
1357 } else {
1358 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1359 * "Message Payload":
1360 *
1361 * "Operand0[7]. For the INT DIV functions, this operand is the
1362 * denominator."
1363 * ...
1364 * "Operand1[7]. For the INT DIV functions, this operand is the
1365 * numerator."
1366 */
1367 bool is_int_div = opcode != SHADER_OPCODE_POW;
1368 fs_reg &op0 = is_int_div ? src1 : src0;
1369 fs_reg &op1 = is_int_div ? src0 : src1;
1370
1371 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1372 inst = emit(opcode, dst, op0, reg_null_f);
1373
1374 inst->base_mrf = base_mrf;
1375 inst->mlen = 2 * dispatch_width / 8;
1376 }
1377 return inst;
1378 }
1379
1380 void
1381 fs_visitor::assign_curb_setup()
1382 {
1383 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1384 if (dispatch_width == 8) {
1385 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1386 } else {
1387 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1388 }
1389
1390 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1391 foreach_list(node, &this->instructions) {
1392 fs_inst *inst = (fs_inst *)node;
1393
1394 for (unsigned int i = 0; i < 3; i++) {
1395 if (inst->src[i].file == UNIFORM) {
1396 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1397 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1398 constant_nr / 8,
1399 constant_nr % 8);
1400
1401 inst->src[i].file = HW_REG;
1402 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1403 }
1404 }
1405 }
1406 }
1407
1408 void
1409 fs_visitor::calculate_urb_setup()
1410 {
1411 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1412 c->prog_data.urb_setup[i] = -1;
1413 }
1414
1415 int urb_next = 0;
1416 /* Figure out where each of the incoming setup attributes lands. */
1417 if (brw->gen >= 6) {
1418 if (_mesa_bitcount_64(fp->Base.InputsRead &
1419 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1420 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1421 * first 16 varying inputs, so we can put them wherever we want.
1422 * Just put them in order.
1423 *
1424 * This is useful because it means that (a) inputs not used by the
1425 * fragment shader won't take up valuable register space, and (b) we
1426 * won't have to recompile the fragment shader if it gets paired with
1427 * a different vertex (or geometry) shader.
1428 */
1429 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1430 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1431 BITFIELD64_BIT(i)) {
1432 c->prog_data.urb_setup[i] = urb_next++;
1433 }
1434 }
1435 } else {
1436 /* We have enough input varyings that the SF/SBE pipeline stage can't
1437 * arbitrarily rearrange them to suit our whim; we have to put them
1438 * in an order that matches the output of the previous pipeline stage
1439 * (geometry or vertex shader).
1440 */
1441 struct brw_vue_map prev_stage_vue_map;
1442 brw_compute_vue_map(brw, &prev_stage_vue_map,
1443 c->key.input_slots_valid);
1444 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1445 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1446 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1447 slot++) {
1448 int varying = prev_stage_vue_map.slot_to_varying[slot];
1449 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1450 * unused.
1451 */
1452 if (varying != BRW_VARYING_SLOT_COUNT &&
1453 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1454 BITFIELD64_BIT(varying))) {
1455 c->prog_data.urb_setup[varying] = slot - first_slot;
1456 }
1457 }
1458 urb_next = prev_stage_vue_map.num_slots - first_slot;
1459 }
1460 } else {
1461 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1462 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1463 /* Point size is packed into the header, not as a general attribute */
1464 if (i == VARYING_SLOT_PSIZ)
1465 continue;
1466
1467 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1468 /* The back color slot is skipped when the front color is
1469 * also written to. In addition, some slots can be
1470 * written in the vertex shader and not read in the
1471 * fragment shader. So the register number must always be
1472 * incremented, mapped or not.
1473 */
1474 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1475 c->prog_data.urb_setup[i] = urb_next;
1476 urb_next++;
1477 }
1478 }
1479
1480 /*
1481 * It's an FS-only attribute, and we did interpolation for this attribute
1482 * in the SF thread. So, count it here, too.
1483 *
1484 * See compile_sf_prog() for more info.
1485 */
1486 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1487 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1488 }
1489
1490 c->prog_data.num_varying_inputs = urb_next;
1491 }
1492
1493 void
1494 fs_visitor::assign_urb_setup()
1495 {
1496 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1497
1498 /* Offset all the urb_setup[] index by the actual position of the
1499 * setup regs, now that the location of the constants has been chosen.
1500 */
1501 foreach_list(node, &this->instructions) {
1502 fs_inst *inst = (fs_inst *)node;
1503
1504 if (inst->opcode == FS_OPCODE_LINTERP) {
1505 assert(inst->src[2].file == HW_REG);
1506 inst->src[2].fixed_hw_reg.nr += urb_start;
1507 }
1508
1509 if (inst->opcode == FS_OPCODE_CINTERP) {
1510 assert(inst->src[0].file == HW_REG);
1511 inst->src[0].fixed_hw_reg.nr += urb_start;
1512 }
1513 }
1514
1515 /* Each attribute is 4 setup channels, each of which is half a reg. */
1516 this->first_non_payload_grf =
1517 urb_start + c->prog_data.num_varying_inputs * 2;
1518 }
1519
1520 /**
1521 * Split large virtual GRFs into separate components if we can.
1522 *
1523 * This is mostly duplicated with what brw_fs_vector_splitting does,
1524 * but that's really conservative because it's afraid of doing
1525 * splitting that doesn't result in real progress after the rest of
1526 * the optimization phases, which would cause infinite looping in
1527 * optimization. We can do it once here, safely. This also has the
1528 * opportunity to split interpolated values, or maybe even uniforms,
1529 * which we don't have at the IR level.
1530 *
1531 * We want to split, because virtual GRFs are what we register
1532 * allocate and spill (due to contiguousness requirements for some
1533 * instructions), and they're what we naturally generate in the
1534 * codegen process, but most virtual GRFs don't actually need to be
1535 * contiguous sets of GRFs. If we split, we'll end up with reduced
1536 * live intervals and better dead code elimination and coalescing.
1537 */
1538 void
1539 fs_visitor::split_virtual_grfs()
1540 {
1541 int num_vars = this->virtual_grf_count;
1542 bool split_grf[num_vars];
1543 int new_virtual_grf[num_vars];
1544
1545 /* Try to split anything > 0 sized. */
1546 for (int i = 0; i < num_vars; i++) {
1547 if (this->virtual_grf_sizes[i] != 1)
1548 split_grf[i] = true;
1549 else
1550 split_grf[i] = false;
1551 }
1552
1553 if (brw->has_pln &&
1554 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1555 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1556 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1557 * Gen6, that was the only supported interpolation mode, and since Gen6,
1558 * delta_x and delta_y are in fixed hardware registers.
1559 */
1560 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1561 false;
1562 }
1563
1564 foreach_list(node, &this->instructions) {
1565 fs_inst *inst = (fs_inst *)node;
1566
1567 /* If there's a SEND message that requires contiguous destination
1568 * registers, no splitting is allowed.
1569 */
1570 if (inst->regs_written > 1) {
1571 split_grf[inst->dst.reg] = false;
1572 }
1573
1574 /* If we're sending from a GRF, don't split it, on the assumption that
1575 * the send is reading the whole thing.
1576 */
1577 if (inst->is_send_from_grf()) {
1578 for (int i = 0; i < 3; i++) {
1579 if (inst->src[i].file == GRF) {
1580 split_grf[inst->src[i].reg] = false;
1581 }
1582 }
1583 }
1584 }
1585
1586 /* Allocate new space for split regs. Note that the virtual
1587 * numbers will be contiguous.
1588 */
1589 for (int i = 0; i < num_vars; i++) {
1590 if (split_grf[i]) {
1591 new_virtual_grf[i] = virtual_grf_alloc(1);
1592 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1593 int reg = virtual_grf_alloc(1);
1594 assert(reg == new_virtual_grf[i] + j - 1);
1595 (void) reg;
1596 }
1597 this->virtual_grf_sizes[i] = 1;
1598 }
1599 }
1600
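/* As a concrete case: a split virtual GRF of size 3 ends up as three
 * size-1 registers. reg_offset 0 stays in the original register (its size
 * was just trimmed to 1 above), while offsets 1 and 2 are redirected by
 * the loop below to new_virtual_grf[i] and new_virtual_grf[i] + 1.
 */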
1601 foreach_list(node, &this->instructions) {
1602 fs_inst *inst = (fs_inst *)node;
1603
1604 if (inst->dst.file == GRF &&
1605 split_grf[inst->dst.reg] &&
1606 inst->dst.reg_offset != 0) {
1607 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1608 inst->dst.reg_offset - 1);
1609 inst->dst.reg_offset = 0;
1610 }
1611 for (int i = 0; i < 3; i++) {
1612 if (inst->src[i].file == GRF &&
1613 split_grf[inst->src[i].reg] &&
1614 inst->src[i].reg_offset != 0) {
1615 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1616 inst->src[i].reg_offset - 1);
1617 inst->src[i].reg_offset = 0;
1618 }
1619 }
1620 }
1621 invalidate_live_intervals();
1622 }
1623
1624 /**
1625 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1626 *
1627 * During code generation, we create tons of temporary variables, many of
1628 * which get immediately killed and are never used again. Yet, in later
1629 * optimization and analysis passes, such as compute_live_intervals, we need
1630 * to loop over all the virtual GRFs. Compacting them can save a lot of
1631 * overhead.
1632 */
1633 void
1634 fs_visitor::compact_virtual_grfs()
1635 {
1636 /* Mark which virtual GRFs are used, and count how many. */
1637 int remap_table[this->virtual_grf_count];
1638 memset(remap_table, -1, sizeof(remap_table));
1639
1640 foreach_list(node, &this->instructions) {
1641 const fs_inst *inst = (const fs_inst *) node;
1642
1643 if (inst->dst.file == GRF)
1644 remap_table[inst->dst.reg] = 0;
1645
1646 for (int i = 0; i < 3; i++) {
1647 if (inst->src[i].file == GRF)
1648 remap_table[inst->src[i].reg] = 0;
1649 }
1650 }
1651
1652 /* In addition to registers used in instructions, fs_visitor keeps
1653 * direct references to certain special values which must be patched:
1654 */
1655 fs_reg *special[] = {
1656 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1657 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1658 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1659 &delta_x[0], &delta_x[1], &delta_x[2],
1660 &delta_x[3], &delta_x[4], &delta_x[5],
1661 &delta_y[0], &delta_y[1], &delta_y[2],
1662 &delta_y[3], &delta_y[4], &delta_y[5],
1663 };
1664 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1665 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1666
1667 /* Treat all special values as used, to be conservative */
1668 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1669 if (special[i]->file == GRF)
1670 remap_table[special[i]->reg] = 0;
1671 }
1672
1673 /* Compact the GRF arrays. */
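/* E.g. if only virtual GRFs #0 and #3 out of five were marked used above,
 * remap_table becomes { 0, -1, -1, 1, -1 } after this loop and
 * virtual_grf_count drops to 2; the patch loops below then rewrite every
 * reference accordingly.
 */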
1674 int new_index = 0;
1675 for (int i = 0; i < this->virtual_grf_count; i++) {
1676 if (remap_table[i] != -1) {
1677 remap_table[i] = new_index;
1678 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1679 invalidate_live_intervals();
1680 ++new_index;
1681 }
1682 }
1683
1684 this->virtual_grf_count = new_index;
1685
1686 /* Patch all the instructions to use the newly renumbered registers */
1687 foreach_list(node, &this->instructions) {
1688 fs_inst *inst = (fs_inst *) node;
1689
1690 if (inst->dst.file == GRF)
1691 inst->dst.reg = remap_table[inst->dst.reg];
1692
1693 for (int i = 0; i < 3; i++) {
1694 if (inst->src[i].file == GRF)
1695 inst->src[i].reg = remap_table[inst->src[i].reg];
1696 }
1697 }
1698
1699 /* Patch all the references to special values */
1700 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1701 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1702 special[i]->reg = remap_table[special[i]->reg];
1703 }
1704 }
1705
1706 bool
1707 fs_visitor::remove_dead_constants()
1708 {
1709 if (dispatch_width == 8) {
1710 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1711 this->nr_params_remap = c->prog_data.nr_params;
1712
1713 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1714 this->params_remap[i] = -1;
1715
1716 /* Find which params are still in use. */
1717 foreach_list(node, &this->instructions) {
1718 fs_inst *inst = (fs_inst *)node;
1719
1720 for (int i = 0; i < 3; i++) {
1721 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1722
1723 if (inst->src[i].file != UNIFORM)
1724 continue;
1725
1726 /* Section 5.11 of the OpenGL 4.3 spec says:
1727 *
1728 * "Out-of-bounds reads return undefined values, which include
1729 * values from other variables of the active program or zero."
1730 */
1731 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1732 constant_nr = 0;
1733 }
1734
1735 /* For now, set this to non-negative. We'll give it the
1736 * actual new number in a moment, in order to keep the
1737 * register numbers nicely ordered.
1738 */
1739 this->params_remap[constant_nr] = 0;
1740 }
1741 }
1742
1743 /* Figure out what the new numbers for the params will be. At some
1744 * point when we're doing uniform array access, we're going to want
1745 * to keep the distinction between .reg and .reg_offset, but for
1746 * now we don't care.
1747 */
1748 unsigned int new_nr_params = 0;
1749 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1750 if (this->params_remap[i] != -1) {
1751 this->params_remap[i] = new_nr_params++;
1752 }
1753 }
1754
1755 /* Update the list of params to be uploaded to match our new numbering. */
1756 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1757 int remapped = this->params_remap[i];
1758
1759 if (remapped == -1)
1760 continue;
1761
1762 c->prog_data.param[remapped] = c->prog_data.param[i];
1763 }
1764
1765 c->prog_data.nr_params = new_nr_params;
1766 } else {
1767 /* This should have been generated in the 8-wide pass already. */
1768 assert(this->params_remap);
1769 }
1770
1771 /* Now do the renumbering of the shader to remove unused params. */
1772 foreach_list(node, &this->instructions) {
1773 fs_inst *inst = (fs_inst *)node;
1774
1775 for (int i = 0; i < 3; i++) {
1776 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1777
1778 if (inst->src[i].file != UNIFORM)
1779 continue;
1780
1781 /* As above, alias out-of-bounds reads to constant 0. */
1782 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1783 constant_nr = 0;
1784 }
1785 assert(this->params_remap[constant_nr] != -1);
1786 inst->src[i].reg = this->params_remap[constant_nr];
1787 inst->src[i].reg_offset = 0;
1788 }
1789 }
1790
1791 return true;
1792 }
1793
1794 /*
1795 * Implements array access of uniforms by inserting a
1796 * PULL_CONSTANT_LOAD instruction.
1797 *
1798 * Unlike temporary GRF array access (where we don't support it due to
1799 * the difficulty of doing relative addressing on instruction
1800 * destinations), we could potentially do array access of uniforms
1801 * that were loaded in GRF space as push constants. In real-world
1802 * usage we've seen, though, the arrays being used are always larger
1803 * than we could load as push constants, so just always move all
1804 * uniform array access out to a pull constant buffer.
1805 */
1806 void
1807 fs_visitor::move_uniform_array_access_to_pull_constants()
1808 {
1809 int pull_constant_loc[c->prog_data.nr_params];
1810
1811 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1812 pull_constant_loc[i] = -1;
1813 }
1814
1815 /* Walk through and find array access of uniforms. Put a copy of that
1816 * uniform in the pull constant buffer.
1817 *
1818 * Note that we don't move constant-indexed accesses to arrays. No
1819 * testing has been done of the performance impact of this choice.
1820 */
1821 foreach_list_safe(node, &this->instructions) {
1822 fs_inst *inst = (fs_inst *)node;
1823
1824 for (int i = 0 ; i < 3; i++) {
1825 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1826 continue;
1827
1828 int uniform = inst->src[i].reg;
1829
1830 /* If this array isn't already present in the pull constant buffer,
1831 * add it.
1832 */
1833 if (pull_constant_loc[uniform] == -1) {
1834 const float **values = &c->prog_data.param[uniform];
1835
1836 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1837
1838 assert(param_size[uniform]);
1839
1840 for (int j = 0; j < param_size[uniform]; j++) {
1841 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1842 values[j];
1843 }
1844 }
1845
1846 /* Set up the annotation tracking for newly generated instructions. */
1847 base_ir = inst->ir;
1848 current_annotation = inst->annotation;
1849
1850 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1851 fs_reg temp = fs_reg(this, glsl_type::float_type);
1852 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1853 surf_index,
1854 *inst->src[i].reladdr,
1855 pull_constant_loc[uniform] +
1856 inst->src[i].reg_offset);
1857 inst->insert_before(&list);
1858
1859 inst->src[i].file = temp.file;
1860 inst->src[i].reg = temp.reg;
1861 inst->src[i].reg_offset = temp.reg_offset;
1862 inst->src[i].reladdr = NULL;
1863 }
1864 }
1865 }
1866
1867 /**
1868 * Choose accesses from the UNIFORM file to demote to using the pull
1869 * constant buffer.
1870 *
1871 * We allow a fragment shader to have more than the specified minimum
1872 * maximum number of fragment shader uniform components (64). If
1873 * there are too many of these, they'd fill up all of register space.
1874 * So, this will push some of them out to the pull constant buffer and
1875 * update the program to load them.
1876 */
1877 void
1878 fs_visitor::setup_pull_constants()
1879 {
1880 /* Only allow 16 registers (128 uniform components) as push constants. */
1881 unsigned int max_uniform_components = 16 * 8;
1882 if (c->prog_data.nr_params <= max_uniform_components)
1883 return;
1884
1885 if (dispatch_width == 16) {
1886 fail("Pull constants not supported in 16-wide\n");
1887 return;
1888 }
1889
1890 /* Just demote the end of the list. We could probably do better
1891 * here, demoting things that are rarely used in the program first.
1892 */
1893 unsigned int pull_uniform_base = max_uniform_components;
1894
1895 int pull_constant_loc[c->prog_data.nr_params];
1896 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1897 if (i < pull_uniform_base) {
1898 pull_constant_loc[i] = -1;
1899 } else {
1900 pull_constant_loc[i] = -1;
1901 /* If our constant is already being uploaded for reladdr purposes,
1902 * reuse it.
1903 */
1904 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1905 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1906 pull_constant_loc[i] = j;
1907 break;
1908 }
1909 }
1910 if (pull_constant_loc[i] == -1) {
1911 int pull_index = c->prog_data.nr_pull_params++;
1912 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1913             pull_constant_loc[i] = pull_index;
1914 }
1915 }
1916 }
1917 c->prog_data.nr_params = pull_uniform_base;
1918
1919 foreach_list(node, &this->instructions) {
1920 fs_inst *inst = (fs_inst *)node;
1921
1922 for (int i = 0; i < 3; i++) {
1923 if (inst->src[i].file != UNIFORM)
1924 continue;
1925
1926 int pull_index = pull_constant_loc[inst->src[i].reg +
1927 inst->src[i].reg_offset];
1928 if (pull_index == -1)
1929 continue;
1930
1931 assert(!inst->src[i].reladdr);
1932
1933 fs_reg dst = fs_reg(this, glsl_type::float_type);
1934 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1935 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1936 fs_inst *pull =
1937 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1938 dst, index, offset);
1939 pull->ir = inst->ir;
1940 pull->annotation = inst->annotation;
1941
1942 inst->insert_before(pull);
1943
1944 inst->src[i].file = GRF;
1945 inst->src[i].reg = dst.reg;
1946 inst->src[i].reg_offset = 0;
1947 inst->src[i].smear = pull_index & 3;
1948 }
1949 }
1950 }
1951
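/**
 * Applies simple algebraic simplifications to instructions whose operands
 * make them trivial, e.g. (sketch, not actual IR):
 *
 *    mul vgrf4, vgrf3, 1.0f    ->   mov vgrf4, vgrf3
 *    add vgrf4, vgrf3, 0.0f    ->   mov vgrf4, vgrf3
 *    or  vgrf4, vgrf3, vgrf3   ->   mov vgrf4, vgrf3
 */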
1952 bool
1953 fs_visitor::opt_algebraic()
1954 {
1955 bool progress = false;
1956
1957 foreach_list(node, &this->instructions) {
1958 fs_inst *inst = (fs_inst *)node;
1959
1960 switch (inst->opcode) {
1961 case BRW_OPCODE_MUL:
1962 if (inst->src[1].file != IMM)
1963 continue;
1964
1965 /* a * 1.0 = a */
1966 if (inst->src[1].is_one()) {
1967 inst->opcode = BRW_OPCODE_MOV;
1968 inst->src[1] = reg_undef;
1969 progress = true;
1970 break;
1971 }
1972
1973 /* a * 0.0 = 0.0 */
1974 if (inst->src[1].is_zero()) {
1975 inst->opcode = BRW_OPCODE_MOV;
1976 inst->src[0] = inst->src[1];
1977 inst->src[1] = reg_undef;
1978 progress = true;
1979 break;
1980 }
1981
1982 break;
1983 case BRW_OPCODE_ADD:
1984 if (inst->src[1].file != IMM)
1985 continue;
1986
1987 /* a + 0.0 = a */
1988 if (inst->src[1].is_zero()) {
1989 inst->opcode = BRW_OPCODE_MOV;
1990 inst->src[1] = reg_undef;
1991 progress = true;
1992 break;
1993 }
1994 break;
1995 case BRW_OPCODE_OR:
1996 if (inst->src[0].equals(inst->src[1])) {
1997 inst->opcode = BRW_OPCODE_MOV;
1998 inst->src[1] = reg_undef;
1999 progress = true;
2000 break;
2001 }
2002 break;
2003 case BRW_OPCODE_SEL:
2004 if (inst->saturate && inst->src[1].file == IMM) {
2005 switch (inst->conditional_mod) {
2006 case BRW_CONDITIONAL_LE:
2007 case BRW_CONDITIONAL_L:
2008 switch (inst->src[1].type) {
2009 case BRW_REGISTER_TYPE_F:
2010 if (inst->src[1].imm.f >= 1.0f) {
2011 inst->opcode = BRW_OPCODE_MOV;
2012 inst->src[1] = reg_undef;
2013 progress = true;
2014 }
2015 break;
2016 default:
2017 break;
2018 }
2019 break;
2020 case BRW_CONDITIONAL_GE:
2021 case BRW_CONDITIONAL_G:
2022 switch (inst->src[1].type) {
2023 case BRW_REGISTER_TYPE_F:
2024 if (inst->src[1].imm.f <= 0.0f) {
2025 inst->opcode = BRW_OPCODE_MOV;
2026 inst->src[1] = reg_undef;
2027 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2028 progress = true;
2029 }
2030 break;
2031 default:
2032 break;
2033 }
2034 default:
2035 break;
2036 }
2037 }
2038 break;
2039 default:
2040 break;
2041 }
2042 }
2043
2044 return progress;
2045 }
2046
2047 /**
2048 * Removes any instructions writing a VGRF where that VGRF is not used by any
2049 * later instruction.
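 *
 * Sketch (not actual IR): the ADD below is removed because vgrf5's live
 * range ends at the instruction that wrote it:
 *
 *    add vgrf5, vgrf3, vgrf4
 *    mov m2, vgrf3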
2050 */
2051 bool
2052 fs_visitor::dead_code_eliminate()
2053 {
2054 bool progress = false;
2055 int pc = 0;
2056
2057 calculate_live_intervals();
2058
2059 foreach_list_safe(node, &this->instructions) {
2060 fs_inst *inst = (fs_inst *)node;
2061
2062 if (inst->dst.file == GRF) {
2063 bool dead = true;
2064
2065 for (int i = 0; i < inst->regs_written; i++) {
2066 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2067 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2068 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2069 dead = false;
2070 break;
2071 }
2072 }
2073
2074 if (dead) {
2075 /* Don't dead code eliminate instructions that write to the
2076 * accumulator as a side-effect. Instead just set the destination
2077 * to the null register to free it.
2078 */
2079 switch (inst->opcode) {
2080 case BRW_OPCODE_ADDC:
2081 case BRW_OPCODE_SUBB:
2082 case BRW_OPCODE_MACH:
2083 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2084 break;
2085 default:
2086 inst->remove();
2087 progress = true;
2088 break;
2089 }
2090 }
2091 }
2092
2093 pc++;
2094 }
2095
2096 if (progress)
2097 invalidate_live_intervals();
2098
2099 return progress;
2100 }
2101
2102 struct dead_code_hash_key
2103 {
2104 int vgrf;
2105 int reg_offset;
2106 };
2107
2108 static bool
2109 dead_code_hash_compare(const void *a, const void *b)
2110 {
2111 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2112 }
2113
2114 static void
2115 clear_dead_code_hash(struct hash_table *ht)
2116 {
2117 struct hash_entry *entry;
2118
2119 hash_table_foreach(ht, entry) {
2120 _mesa_hash_table_remove(ht, entry);
2121 }
2122 }
2123
2124 static void
2125 insert_dead_code_hash(struct hash_table *ht,
2126 int vgrf, int reg_offset, fs_inst *inst)
2127 {
2128 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2129 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2130
2131 key->vgrf = vgrf;
2132 key->reg_offset = reg_offset;
2133
2134 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2135 }
2136
2137 static struct hash_entry *
2138 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2139 {
2140 struct dead_code_hash_key key;
2141
2142 key.vgrf = vgrf;
2143 key.reg_offset = reg_offset;
2144
2145 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2146 }
2147
2148 static void
2149 remove_dead_code_hash(struct hash_table *ht,
2150 int vgrf, int reg_offset)
2151 {
2152 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2153 if (!entry)
2154 return;
2155
2156 _mesa_hash_table_remove(ht, entry);
2157 }
2158
2159 /**
2160 * Walks basic blocks, removing any regs that are written but not read before
2161 * being redefined.
2162 *
2163 * The dead_code_eliminate() function implements a global dead code
2164  * elimination, but it only handles removing the last write to a register
2165 * if it's never read. This one can handle intermediate writes, but only
2166 * within a basic block.
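 *
 * Sketch (not actual IR): within one block, the first MOV below is removed
 * because vgrf4 is fully rewritten before ever being read:
 *
 *    mov vgrf4, vgrf1
 *    mov vgrf4, vgrf2
 *    add vgrf5, vgrf4, vgrf3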
2167 */
2168 bool
2169 fs_visitor::dead_code_eliminate_local()
2170 {
2171 struct hash_table *ht;
2172 bool progress = false;
2173
2174 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2175
2176 foreach_list_safe(node, &this->instructions) {
2177 fs_inst *inst = (fs_inst *)node;
2178
2179       /* At a basic block boundary, empty the HT since we don't understand dataflow
2180 * here.
2181 */
2182 if (inst->is_control_flow()) {
2183 clear_dead_code_hash(ht);
2184 continue;
2185 }
2186
2187 /* Clear the HT of any instructions that got read. */
2188 for (int i = 0; i < 3; i++) {
2189 fs_reg src = inst->src[i];
2190 if (src.file != GRF)
2191 continue;
2192
2193 int read = 1;
2194 if (inst->is_send_from_grf())
2195 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2196
2197 for (int reg_offset = src.reg_offset;
2198 reg_offset < src.reg_offset + read;
2199 reg_offset++) {
2200 remove_dead_code_hash(ht, src.reg, reg_offset);
2201 }
2202 }
2203
2204 /* Add any update of a GRF to the HT, removing a previous write if it
2205 * wasn't read.
2206 */
2207 if (inst->dst.file == GRF) {
2208 if (inst->regs_written > 1) {
2209 /* We don't know how to trim channels from an instruction's
2210 * writes, so we can't incrementally remove unread channels from
2211              * it.  Just remove whatever it overwrites from the table.
2212 */
2213 for (int i = 0; i < inst->regs_written; i++) {
2214 remove_dead_code_hash(ht,
2215 inst->dst.reg,
2216 inst->dst.reg_offset + i);
2217 }
2218 } else {
2219 struct hash_entry *entry =
2220 get_dead_code_hash_entry(ht, inst->dst.reg,
2221 inst->dst.reg_offset);
2222
2223 if (inst->is_partial_write()) {
2224 /* For a partial write, we can't remove any previous dead code
2225              * candidate, since we're just modifying its result, but we can
2226              * be dead code eliminated ourselves.
2227 */
2228 if (entry) {
2229 entry->data = inst;
2230 } else {
2231 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2232 inst);
2233 }
2234 } else {
2235 if (entry) {
2236 /* We're completely updating a channel, and there was a
2237 * previous write to the channel that wasn't read. Kill it!
2238 */
2239 fs_inst *inst = (fs_inst *)entry->data;
2240 inst->remove();
2241 progress = true;
2242 _mesa_hash_table_remove(ht, entry);
2243 }
2244
2245 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2246 inst);
2247 }
2248 }
2249 }
2250 }
2251
2252 _mesa_hash_table_destroy(ht, NULL);
2253
2254 if (progress)
2255 invalidate_live_intervals();
2256
2257 return progress;
2258 }
2259
2260 /**
2261 * Implements a second type of register coalescing: This one checks if
2262 * the two regs involved in a raw move don't interfere, in which case
2263  * they can both be stored in the same place and the MOV removed.
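 *
 * Sketch (not actual IR): if vgrf3 and vgrf4 don't interfere, then given
 *
 *    add vgrf3, vgrf1, vgrf2
 *    mov vgrf4, vgrf3
 *
 * every reference to vgrf3 is redirected to vgrf4 and the MOV is removed.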
2264 */
2265 bool
2266 fs_visitor::register_coalesce_2()
2267 {
2268 bool progress = false;
2269
2270 calculate_live_intervals();
2271
2272 foreach_list_safe(node, &this->instructions) {
2273 fs_inst *inst = (fs_inst *)node;
2274
2275 if (inst->opcode != BRW_OPCODE_MOV ||
2276 inst->is_partial_write() ||
2277 inst->saturate ||
2278 inst->src[0].file != GRF ||
2279 inst->src[0].negate ||
2280 inst->src[0].abs ||
2281 inst->src[0].smear != -1 ||
2282 inst->dst.file != GRF ||
2283 inst->dst.type != inst->src[0].type ||
2284 virtual_grf_sizes[inst->src[0].reg] != 1) {
2285 continue;
2286 }
2287
2288 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2289 int var_to = live_intervals->var_from_reg(&inst->dst);
2290
2291 if (live_intervals->vars_interfere(var_from, var_to))
2292 continue;
2293
2294 int reg_from = inst->src[0].reg;
2295 assert(inst->src[0].reg_offset == 0);
2296 int reg_to = inst->dst.reg;
2297 int reg_to_offset = inst->dst.reg_offset;
2298
2299 foreach_list(node, &this->instructions) {
2300 fs_inst *scan_inst = (fs_inst *)node;
2301
2302 if (scan_inst->dst.file == GRF &&
2303 scan_inst->dst.reg == reg_from) {
2304 scan_inst->dst.reg = reg_to;
2305 scan_inst->dst.reg_offset = reg_to_offset;
2306 }
2307 for (int i = 0; i < 3; i++) {
2308 if (scan_inst->src[i].file == GRF &&
2309 scan_inst->src[i].reg == reg_from) {
2310 scan_inst->src[i].reg = reg_to;
2311 scan_inst->src[i].reg_offset = reg_to_offset;
2312 }
2313 }
2314 }
2315
2316 inst->remove();
2317 progress = true;
2318 continue;
2319 }
2320
2321 if (progress)
2322 invalidate_live_intervals();
2323
2324 return progress;
2325 }
2326
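/**
 * Coalesces a raw MOV out of a GRF (or uniform) into its later uses, when
 * neither the source nor the destination is overwritten before the end of
 * the program.  Sketch (not actual IR):
 *
 *    mov vgrf4, vgrf3
 *    mul vgrf5, vgrf4, vgrf2   ->   mul vgrf5, vgrf3, vgrf2
 *
 * after which the MOV itself is removed.
 */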
2327 bool
2328 fs_visitor::register_coalesce()
2329 {
2330 bool progress = false;
2331 int if_depth = 0;
2332 int loop_depth = 0;
2333
2334 foreach_list_safe(node, &this->instructions) {
2335 fs_inst *inst = (fs_inst *)node;
2336
2337 /* Make sure that we dominate the instructions we're going to
2338 * scan for interfering with our coalescing, or we won't have
2339 * scanned enough to see if anything interferes with our
2340 * coalescing. We don't dominate the following instructions if
2341 * we're in a loop or an if block.
2342 */
2343 switch (inst->opcode) {
2344 case BRW_OPCODE_DO:
2345 loop_depth++;
2346 break;
2347 case BRW_OPCODE_WHILE:
2348 loop_depth--;
2349 break;
2350 case BRW_OPCODE_IF:
2351 if_depth++;
2352 break;
2353 case BRW_OPCODE_ENDIF:
2354 if_depth--;
2355 break;
2356 default:
2357 break;
2358 }
2359 if (loop_depth || if_depth)
2360 continue;
2361
2362 if (inst->opcode != BRW_OPCODE_MOV ||
2363 inst->is_partial_write() ||
2364 inst->saturate ||
2365 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2366                                  inst->src[0].file != UNIFORM) ||
2367 inst->dst.type != inst->src[0].type)
2368 continue;
2369
2370 bool has_source_modifiers = (inst->src[0].abs ||
2371 inst->src[0].negate ||
2372 inst->src[0].smear != -1 ||
2373 inst->src[0].file == UNIFORM);
2374
2375 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2376 * them: check for no writes to either one until the exit of the
2377 * program.
2378 */
2379 bool interfered = false;
2380
2381 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2382 !scan_inst->is_tail_sentinel();
2383 scan_inst = (fs_inst *)scan_inst->next) {
2384 if (scan_inst->dst.file == GRF) {
2385 if (scan_inst->overwrites_reg(inst->dst) ||
2386 scan_inst->overwrites_reg(inst->src[0])) {
2387 interfered = true;
2388 break;
2389 }
2390 }
2391
2392 if (has_source_modifiers) {
2393 for (int i = 0; i < 3; i++) {
2394 if (scan_inst->src[i].file == GRF &&
2395 scan_inst->src[i].reg == inst->dst.reg &&
2396 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2397 inst->dst.type != scan_inst->src[i].type)
2398 {
2399 interfered = true;
2400 break;
2401 }
2402 }
2403 }
2404
2405
2406 /* The gen6 MATH instruction can't handle source modifiers or
2407 * unusual register regions, so avoid coalescing those for
2408 * now. We should do something more specific.
2409 */
2410 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2411 interfered = true;
2412 break;
2413 }
2414
2415 if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2416 scan_inst->src[0].file == GRF &&
2417 scan_inst->src[0].reg == inst->dst.reg) {
2418 interfered = true;
2419 break;
2420 }
2421
2422 /* The accumulator result appears to get used for the
2423 * conditional modifier generation. When negating a UD
2424 * value, there is a 33rd bit generated for the sign in the
2425 * accumulator value, so now you can't check, for example,
2426 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2427 */
2428 if (scan_inst->conditional_mod &&
2429 inst->src[0].negate &&
2430 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2431 interfered = true;
2432 break;
2433 }
2434 }
2435 if (interfered) {
2436 continue;
2437 }
2438
2439 /* Rewrite the later usage to point at the source of the move to
2440 * be removed.
2441 */
2442 for (fs_inst *scan_inst = inst;
2443 !scan_inst->is_tail_sentinel();
2444 scan_inst = (fs_inst *)scan_inst->next) {
2445 for (int i = 0; i < 3; i++) {
2446 if (scan_inst->src[i].file == GRF &&
2447 scan_inst->src[i].reg == inst->dst.reg &&
2448 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2449 fs_reg new_src = inst->src[0];
2450 if (scan_inst->src[i].abs) {
2451 new_src.negate = 0;
2452 new_src.abs = 1;
2453 }
2454 new_src.negate ^= scan_inst->src[i].negate;
2455 new_src.sechalf = scan_inst->src[i].sechalf;
2456 scan_inst->src[i] = new_src;
2457 }
2458 }
2459 }
2460
2461 inst->remove();
2462 progress = true;
2463 }
2464
2465 if (progress)
2466 invalidate_live_intervals();
2467
2468 return progress;
2469 }
2470
2471
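/**
 * Rewrites the instruction that computed a GRF value so that it writes
 * directly into the MRF a later MOV copies it to, then removes the MOV.
 * Sketch (not actual IR):
 *
 *    add vgrf3, vgrf1, vgrf2
 *    mov m4, vgrf3
 *
 * becomes a single "add m4, vgrf1, vgrf2".
 */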
2472 bool
2473 fs_visitor::compute_to_mrf()
2474 {
2475 bool progress = false;
2476 int next_ip = 0;
2477
2478 calculate_live_intervals();
2479
2480 foreach_list_safe(node, &this->instructions) {
2481 fs_inst *inst = (fs_inst *)node;
2482
2483 int ip = next_ip;
2484 next_ip++;
2485
2486 if (inst->opcode != BRW_OPCODE_MOV ||
2487 inst->is_partial_write() ||
2488 inst->dst.file != MRF || inst->src[0].file != GRF ||
2489 inst->dst.type != inst->src[0].type ||
2490 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2491 continue;
2492
2493 /* Work out which hardware MRF registers are written by this
2494 * instruction.
2495 */
2496 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2497 int mrf_high;
2498 if (inst->dst.reg & BRW_MRF_COMPR4) {
2499 mrf_high = mrf_low + 4;
2500 } else if (dispatch_width == 16 &&
2501 (!inst->force_uncompressed && !inst->force_sechalf)) {
2502 mrf_high = mrf_low + 1;
2503 } else {
2504 mrf_high = mrf_low;
2505 }
2506
2507 /* Can't compute-to-MRF this GRF if someone else was going to
2508 * read it later.
2509 */
2510 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2511 continue;
2512
2513 /* Found a move of a GRF to a MRF. Let's see if we can go
2514 * rewrite the thing that made this GRF to write into the MRF.
2515 */
2516 fs_inst *scan_inst;
2517 for (scan_inst = (fs_inst *)inst->prev;
2518 scan_inst->prev != NULL;
2519 scan_inst = (fs_inst *)scan_inst->prev) {
2520 if (scan_inst->dst.file == GRF &&
2521 scan_inst->dst.reg == inst->src[0].reg) {
2522 /* Found the last thing to write our reg we want to turn
2523 * into a compute-to-MRF.
2524 */
2525
2526 /* If this one instruction didn't populate all the
2527 * channels, bail. We might be able to rewrite everything
2528 * that writes that reg, but it would require smarter
2529 * tracking to delay the rewriting until complete success.
2530 */
2531 if (scan_inst->is_partial_write())
2532 break;
2533
2534 /* Things returning more than one register would need us to
2535 * understand coalescing out more than one MOV at a time.
2536 */
2537 if (scan_inst->regs_written > 1)
2538 break;
2539
2540 /* SEND instructions can't have MRF as a destination. */
2541 if (scan_inst->mlen)
2542 break;
2543
2544 if (brw->gen == 6) {
2545 /* gen6 math instructions must have the destination be
2546 * GRF, so no compute-to-MRF for them.
2547 */
2548 if (scan_inst->is_math()) {
2549 break;
2550 }
2551 }
2552
2553 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2554 /* Found the creator of our MRF's source value. */
2555 scan_inst->dst.file = MRF;
2556 scan_inst->dst.reg = inst->dst.reg;
2557 scan_inst->saturate |= inst->saturate;
2558 inst->remove();
2559 progress = true;
2560 }
2561 break;
2562 }
2563
2564 /* We don't handle control flow here. Most computation of
2565 * values that end up in MRFs are shortly before the MRF
2566 * write anyway.
2567 */
2568 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2569 break;
2570
2571 /* You can't read from an MRF, so if someone else reads our
2572 * MRF's source GRF that we wanted to rewrite, that stops us.
2573 */
2574 bool interfered = false;
2575 for (int i = 0; i < 3; i++) {
2576 if (scan_inst->src[i].file == GRF &&
2577 scan_inst->src[i].reg == inst->src[0].reg &&
2578 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2579 interfered = true;
2580 }
2581 }
2582 if (interfered)
2583 break;
2584
2585 if (scan_inst->dst.file == MRF) {
2586 /* If somebody else writes our MRF here, we can't
2587 * compute-to-MRF before that.
2588 */
2589 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2590 int scan_mrf_high;
2591
2592 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2593 scan_mrf_high = scan_mrf_low + 4;
2594 } else if (dispatch_width == 16 &&
2595 (!scan_inst->force_uncompressed &&
2596 !scan_inst->force_sechalf)) {
2597 scan_mrf_high = scan_mrf_low + 1;
2598 } else {
2599 scan_mrf_high = scan_mrf_low;
2600 }
2601
2602 if (mrf_low == scan_mrf_low ||
2603 mrf_low == scan_mrf_high ||
2604 mrf_high == scan_mrf_low ||
2605 mrf_high == scan_mrf_high) {
2606 break;
2607 }
2608 }
2609
2610 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2611 /* Found a SEND instruction, which means that there are
2612 * live values in MRFs from base_mrf to base_mrf +
2613 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2614 * above it.
2615 */
2616 if (mrf_low >= scan_inst->base_mrf &&
2617 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2618 break;
2619 }
2620 if (mrf_high >= scan_inst->base_mrf &&
2621 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2622 break;
2623 }
2624 }
2625 }
2626 }
2627
2628 if (progress)
2629 invalidate_live_intervals();
2630
2631 return progress;
2632 }
2633
2634 /**
2635 * Walks through basic blocks, looking for repeated MRF writes and
2636 * removing the later ones.
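 *
 * Sketch (not actual IR): the second MOV below is removed, provided neither
 * m2 nor vgrf3 is written in between:
 *
 *    mov m2, vgrf3
 *    ...
 *    mov m2, vgrf3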
2637 */
2638 bool
2639 fs_visitor::remove_duplicate_mrf_writes()
2640 {
2641 fs_inst *last_mrf_move[16];
2642 bool progress = false;
2643
2644    /* We'd need to update the MRF tracking to handle compressed instructions. */
2645 if (dispatch_width == 16)
2646 return false;
2647
2648 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2649
2650 foreach_list_safe(node, &this->instructions) {
2651 fs_inst *inst = (fs_inst *)node;
2652
2653 if (inst->is_control_flow()) {
2654 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2655 }
2656
2657 if (inst->opcode == BRW_OPCODE_MOV &&
2658 inst->dst.file == MRF) {
2659 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2660 if (prev_inst && inst->equals(prev_inst)) {
2661 inst->remove();
2662 progress = true;
2663 continue;
2664 }
2665 }
2666
2667 /* Clear out the last-write records for MRFs that were overwritten. */
2668 if (inst->dst.file == MRF) {
2669 last_mrf_move[inst->dst.reg] = NULL;
2670 }
2671
2672 if (inst->mlen > 0 && inst->base_mrf != -1) {
2673 /* Found a SEND instruction, which will include two or fewer
2674 * implied MRF writes. We could do better here.
2675 */
2676 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2677 last_mrf_move[inst->base_mrf + i] = NULL;
2678 }
2679 }
2680
2681 /* Clear out any MRF move records whose sources got overwritten. */
2682 if (inst->dst.file == GRF) {
2683 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2684 if (last_mrf_move[i] &&
2685 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2686 last_mrf_move[i] = NULL;
2687 }
2688 }
2689 }
2690
2691 if (inst->opcode == BRW_OPCODE_MOV &&
2692 inst->dst.file == MRF &&
2693 inst->src[0].file == GRF &&
2694 !inst->is_partial_write()) {
2695 last_mrf_move[inst->dst.reg] = inst;
2696 }
2697 }
2698
2699 if (progress)
2700 invalidate_live_intervals();
2701
2702 return progress;
2703 }
2704
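/**
 * Clears the needs-dependency flag for any register in [first_grf,
 * first_grf + grf_len) that this instruction reads, since a read of the
 * register satisfies the dependency the gen4 send workarounds are tracking.
 */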
2705 static void
2706 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2707 int first_grf, int grf_len)
2708 {
2709 bool inst_16wide = (dispatch_width > 8 &&
2710 !inst->force_uncompressed &&
2711 !inst->force_sechalf);
2712
2713 /* Clear the flag for registers that actually got read (as expected). */
2714 for (int i = 0; i < 3; i++) {
2715 int grf;
2716 if (inst->src[i].file == GRF) {
2717 grf = inst->src[i].reg;
2718 } else if (inst->src[i].file == HW_REG &&
2719 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2720 grf = inst->src[i].fixed_hw_reg.nr;
2721 } else {
2722 continue;
2723 }
2724
2725 if (grf >= first_grf &&
2726 grf < first_grf + grf_len) {
2727 deps[grf - first_grf] = false;
2728 if (inst_16wide)
2729 deps[grf - first_grf + 1] = false;
2730 }
2731 }
2732 }
2733
2734 /**
2735 * Implements this workaround for the original 965:
2736 *
2737 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2738 * check for post destination dependencies on this instruction, software
2739 * must ensure that there is no destination hazard for the case of ‘write
2740 * followed by a posted write’ shown in the following example.
2741 *
2742 * 1. mov r3 0
2743 * 2. send r3.xy <rest of send instruction>
2744 * 3. mov r2 r3
2745 *
2746 * Due to no post-destination dependency check on the ‘send’, the above
2747 * code sequence could have two instructions (1 and 2) in flight at the
2748 * same time that both consider ‘r3’ as the target of their final writes.
2749 */
2750 void
2751 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2752 {
2753 int reg_size = dispatch_width / 8;
2754 int write_len = inst->regs_written * reg_size;
2755 int first_write_grf = inst->dst.reg;
2756 bool needs_dep[BRW_MAX_MRF];
2757 assert(write_len < (int)sizeof(needs_dep) - 1);
2758
2759 memset(needs_dep, false, sizeof(needs_dep));
2760 memset(needs_dep, true, write_len);
2761
2762 clear_deps_for_inst_src(inst, dispatch_width,
2763 needs_dep, first_write_grf, write_len);
2764
2765 /* Walk backwards looking for writes to registers we're writing which
2766 * aren't read since being written. If we hit the start of the program,
2767 * we assume that there are no outstanding dependencies on entry to the
2768 * program.
2769 */
2770 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2771 scan_inst != NULL;
2772 scan_inst = (fs_inst *)scan_inst->prev) {
2773
2774 /* If we hit control flow, assume that there *are* outstanding
2775 * dependencies, and force their cleanup before our instruction.
2776 */
2777 if (scan_inst->is_control_flow()) {
2778 for (int i = 0; i < write_len; i++) {
2779 if (needs_dep[i]) {
2780 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2781 }
2782 }
2783 return;
2784 }
2785
2786 bool scan_inst_16wide = (dispatch_width > 8 &&
2787 !scan_inst->force_uncompressed &&
2788 !scan_inst->force_sechalf);
2789
2790 /* We insert our reads as late as possible on the assumption that any
2791 * instruction but a MOV that might have left us an outstanding
2792 * dependency has more latency than a MOV.
2793 */
2794 if (scan_inst->dst.file == GRF) {
2795 for (int i = 0; i < scan_inst->regs_written; i++) {
2796 int reg = scan_inst->dst.reg + i * reg_size;
2797
2798 if (reg >= first_write_grf &&
2799 reg < first_write_grf + write_len &&
2800 needs_dep[reg - first_write_grf]) {
2801 inst->insert_before(DEP_RESOLVE_MOV(reg));
2802 needs_dep[reg - first_write_grf] = false;
2803 if (scan_inst_16wide)
2804 needs_dep[reg - first_write_grf + 1] = false;
2805 }
2806 }
2807 }
2808
2809 /* Clear the flag for registers that actually got read (as expected). */
2810 clear_deps_for_inst_src(scan_inst, dispatch_width,
2811 needs_dep, first_write_grf, write_len);
2812
2813 /* Continue the loop only if we haven't resolved all the dependencies */
2814 int i;
2815 for (i = 0; i < write_len; i++) {
2816 if (needs_dep[i])
2817 break;
2818 }
2819 if (i == write_len)
2820 return;
2821 }
2822 }
2823
2824 /**
2825 * Implements this workaround for the original 965:
2826 *
2827 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2828 * used as a destination register until after it has been sourced by an
2829 * instruction with a different destination register.
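 *
 * The pass below walks forward from each send and inserts a
 * dependency-resolving MOV that sources any still-pending destination
 * register before a later instruction is allowed to overwrite it.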
2830 */
2831 void
2832 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2833 {
2834 int write_len = inst->regs_written * dispatch_width / 8;
2835 int first_write_grf = inst->dst.reg;
2836 bool needs_dep[BRW_MAX_MRF];
2837 assert(write_len < (int)sizeof(needs_dep) - 1);
2838
2839 memset(needs_dep, false, sizeof(needs_dep));
2840 memset(needs_dep, true, write_len);
2841 /* Walk forwards looking for writes to registers we're writing which aren't
2842 * read before being written.
2843 */
2844 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2845 !scan_inst->is_tail_sentinel();
2846 scan_inst = (fs_inst *)scan_inst->next) {
2847 /* If we hit control flow, force resolve all remaining dependencies. */
2848 if (scan_inst->is_control_flow()) {
2849 for (int i = 0; i < write_len; i++) {
2850 if (needs_dep[i])
2851 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2852 }
2853 return;
2854 }
2855
2856 /* Clear the flag for registers that actually got read (as expected). */
2857 clear_deps_for_inst_src(scan_inst, dispatch_width,
2858 needs_dep, first_write_grf, write_len);
2859
2860 /* We insert our reads as late as possible since they're reading the
2861 * result of a SEND, which has massive latency.
2862 */
2863 if (scan_inst->dst.file == GRF &&
2864 scan_inst->dst.reg >= first_write_grf &&
2865 scan_inst->dst.reg < first_write_grf + write_len &&
2866 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2867 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2868 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2869 }
2870
2871 /* Continue the loop only if we haven't resolved all the dependencies */
2872 int i;
2873 for (i = 0; i < write_len; i++) {
2874 if (needs_dep[i])
2875 break;
2876 }
2877 if (i == write_len)
2878 return;
2879 }
2880
2881 /* If we hit the end of the program, resolve all remaining dependencies out
2882 * of paranoia.
2883 */
2884 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2885 assert(last_inst->eot);
2886 for (int i = 0; i < write_len; i++) {
2887 if (needs_dep[i])
2888 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2889 }
2890 }
2891
2892 void
2893 fs_visitor::insert_gen4_send_dependency_workarounds()
2894 {
2895 if (brw->gen != 4 || brw->is_g4x)
2896 return;
2897
2898 /* Note that we're done with register allocation, so GRF fs_regs always
2899 * have a .reg_offset of 0.
2900 */
2901
2902 foreach_list_safe(node, &this->instructions) {
2903 fs_inst *inst = (fs_inst *)node;
2904
2905 if (inst->mlen != 0 && inst->dst.file == GRF) {
2906 insert_gen4_pre_send_dependency_workarounds(inst);
2907 insert_gen4_post_send_dependency_workarounds(inst);
2908 }
2909 }
2910 }
2911
2912 /**
2913 * Turns the generic expression-style uniform pull constant load instruction
2914 * into a hardware-specific series of instructions for loading a pull
2915 * constant.
2916 *
2917 * The expression style allows the CSE pass before this to optimize out
2918 * repeated loads from the same offset, and gives the pre-register-allocation
2919 * scheduling full flexibility, while the conversion to native instructions
2920 * allows the post-register-allocation scheduler the best information
2921 * possible.
2922 *
2923 * Note that execution masking for setting up pull constant loads is special:
2924 * the channels that need to be written are unrelated to the current execution
2925 * mask, since a later instruction will use one of the result channels as a
2926 * source operand for all 8 or 16 of its channels.
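 *
 * Sketch of the gen7 lowering (not actual IR):
 *
 *    uniform_pull_constant_load dst, surf_index, byte_offset
 *
 * becomes roughly
 *
 *    set_simd4x2_offset payload, byte_offset / 4
 *    uniform_pull_constant_load_gen7 dst, surf_index, payload
 *
 * while pre-gen7 hardware keeps the original opcode and just gets an MRF
 * assigned for its message.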
2927 */
2928 void
2929 fs_visitor::lower_uniform_pull_constant_loads()
2930 {
2931 foreach_list(node, &this->instructions) {
2932 fs_inst *inst = (fs_inst *)node;
2933
2934 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2935 continue;
2936
2937 if (brw->gen >= 7) {
2938 /* The offset arg before was a vec4-aligned byte offset. We need to
2939 * turn it into a dword offset.
2940 */
2941 fs_reg const_offset_reg = inst->src[1];
2942 assert(const_offset_reg.file == IMM &&
2943 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2944 const_offset_reg.imm.u /= 4;
2945 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2946
2947 /* This is actually going to be a MOV, but since only the first dword
2948 * is accessed, we have a special opcode to do just that one. Note
2949 * that this needs to be an operation that will be considered a def
2950 * by live variable analysis, or register allocation will explode.
2951 */
2952 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2953 payload, const_offset_reg);
2954 setup->force_writemask_all = true;
2955
2956 setup->ir = inst->ir;
2957 setup->annotation = inst->annotation;
2958 inst->insert_before(setup);
2959
2960 /* Similarly, this will only populate the first 4 channels of the
2961 * result register (since we only use smear values from 0-3), but we
2962 * don't tell the optimizer.
2963 */
2964 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2965 inst->src[1] = payload;
2966
2967 invalidate_live_intervals();
2968 } else {
2969 /* Before register allocation, we didn't tell the scheduler about the
2970 * MRF we use. We know it's safe to use this MRF because nothing
2971 * else does except for register spill/unspill, which generates and
2972 * uses its MRF within a single IR instruction.
2973 */
2974 inst->base_mrf = 14;
2975 inst->mlen = 1;
2976 }
2977 }
2978 }
2979
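/**
 * Prints one instruction in a human-readable form for debugging; the output
 * is roughly of the form (sketch):
 *
 *    (+f0.0) add.sat vgrf5, vgrf3, -vgrf4, (null)
 */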
2980 void
2981 fs_visitor::dump_instruction(backend_instruction *be_inst)
2982 {
2983 fs_inst *inst = (fs_inst *)be_inst;
2984
2985 if (inst->predicate) {
2986 printf("(%cf0.%d) ",
2987 inst->predicate_inverse ? '-' : '+',
2988 inst->flag_subreg);
2989 }
2990
2991 printf("%s", brw_instruction_name(inst->opcode));
2992 if (inst->saturate)
2993 printf(".sat");
2994 if (inst->conditional_mod) {
2995 printf(".cmod");
2996 if (!inst->predicate &&
2997 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2998 inst->opcode != BRW_OPCODE_IF &&
2999 inst->opcode != BRW_OPCODE_WHILE))) {
3000 printf(".f0.%d", inst->flag_subreg);
3001 }
3002 }
3003 printf(" ");
3004
3005
3006 switch (inst->dst.file) {
3007 case GRF:
3008 printf("vgrf%d", inst->dst.reg);
3009 if (inst->dst.reg_offset)
3010 printf("+%d", inst->dst.reg_offset);
3011 break;
3012 case MRF:
3013 printf("m%d", inst->dst.reg);
3014 break;
3015 case BAD_FILE:
3016 printf("(null)");
3017 break;
3018 case UNIFORM:
3019 printf("***u%d***", inst->dst.reg);
3020 break;
3021 case HW_REG:
3022 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3023 if (inst->dst.fixed_hw_reg.subnr)
3024 printf("+%d", inst->dst.fixed_hw_reg.subnr);
3025 break;
3026 default:
3027 printf("???");
3028 break;
3029 }
3030 printf(", ");
3031
3032 for (int i = 0; i < 3; i++) {
3033 if (inst->src[i].negate)
3034 printf("-");
3035 if (inst->src[i].abs)
3036 printf("|");
3037 switch (inst->src[i].file) {
3038 case GRF:
3039 printf("vgrf%d", inst->src[i].reg);
3040 if (inst->src[i].reg_offset)
3041 printf("+%d", inst->src[i].reg_offset);
3042 break;
3043 case MRF:
3044 printf("***m%d***", inst->src[i].reg);
3045 break;
3046 case UNIFORM:
3047 printf("u%d", inst->src[i].reg);
3048 if (inst->src[i].reg_offset)
3049 printf(".%d", inst->src[i].reg_offset);
3050 break;
3051 case BAD_FILE:
3052 printf("(null)");
3053 break;
3054 case IMM:
3055 switch (inst->src[i].type) {
3056 case BRW_REGISTER_TYPE_F:
3057 printf("%ff", inst->src[i].imm.f);
3058 break;
3059 case BRW_REGISTER_TYPE_D:
3060 printf("%dd", inst->src[i].imm.i);
3061 break;
3062 case BRW_REGISTER_TYPE_UD:
3063 printf("%uu", inst->src[i].imm.u);
3064 break;
3065 default:
3066 printf("???");
3067 break;
3068 }
3069 break;
3070 case HW_REG:
3071 if (inst->src[i].fixed_hw_reg.negate)
3072 printf("-");
3073 if (inst->src[i].fixed_hw_reg.abs)
3074 printf("|");
3075 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3076 if (inst->src[i].fixed_hw_reg.subnr)
3077 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3078 if (inst->src[i].fixed_hw_reg.abs)
3079 printf("|");
3080 break;
3081 default:
3082 printf("???");
3083 break;
3084 }
3085 if (inst->src[i].abs)
3086 printf("|");
3087
3088 if (i < 3)
3089 printf(", ");
3090 }
3091
3092 printf(" ");
3093
3094 if (inst->force_uncompressed)
3095 printf("1sthalf ");
3096
3097 if (inst->force_sechalf)
3098 printf("2ndhalf ");
3099
3100 printf("\n");
3101 }
3102
3103 /**
3104 * Possibly returns an instruction that set up @param reg.
3105 *
3106 * Sometimes we want to take the result of some expression/variable
3107 * dereference tree and rewrite the instruction generating the result
3108 * of the tree. When processing the tree, we know that the
3109 * instructions generated are all writing temporaries that are dead
3110 * outside of this tree. So, if we have some instructions that write
3111 * a temporary, we're free to point that temp write somewhere else.
3112 *
3113  * Note that this doesn't guarantee that the returned instruction wrote
3114  * only reg -- it might be the size=4 destination of a texture instruction.
3115 */
3116 fs_inst *
3117 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3118 fs_inst *end,
3119 fs_reg reg)
3120 {
3121 if (end == start ||
3122 end->is_partial_write() ||
3123 reg.reladdr ||
3124 !reg.equals(end->dst)) {
3125 return NULL;
3126 } else {
3127 return end;
3128 }
3129 }
3130
3131 void
3132 fs_visitor::setup_payload_gen6()
3133 {
3134 bool uses_depth =
3135 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3136 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3137
3138 assert(brw->gen >= 6);
3139
3140 /* R0-1: masks, pixel X/Y coordinates. */
3141 c->nr_payload_regs = 2;
3142    /* R2: only for 32-pixel dispatch. */
3143
3144 /* R3-26: barycentric interpolation coordinates. These appear in the
3145 * same order that they appear in the brw_wm_barycentric_interp_mode
3146 * enum. Each set of coordinates occupies 2 registers if dispatch width
3147 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3148 * appear if they were enabled using the "Barycentric Interpolation
3149 * Mode" bits in WM_STATE.
3150 */
3151 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3152 if (barycentric_interp_modes & (1 << i)) {
3153 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3154 c->nr_payload_regs += 2;
3155 if (dispatch_width == 16) {
3156 c->nr_payload_regs += 2;
3157 }
3158 }
3159 }
3160
3161 /* R27: interpolated depth if uses source depth */
3162 if (uses_depth) {
3163 c->source_depth_reg = c->nr_payload_regs;
3164 c->nr_payload_regs++;
3165 if (dispatch_width == 16) {
3166 /* R28: interpolated depth if not 8-wide. */
3167 c->nr_payload_regs++;
3168 }
3169 }
3170 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3171 if (uses_depth) {
3172 c->source_w_reg = c->nr_payload_regs;
3173 c->nr_payload_regs++;
3174 if (dispatch_width == 16) {
3175 /* R30: interpolated W if not 8-wide. */
3176 c->nr_payload_regs++;
3177 }
3178 }
3179
3180 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3181 /* R31: MSAA position offsets. */
3182 if (c->prog_data.uses_pos_offset) {
3183 c->sample_pos_reg = c->nr_payload_regs;
3184 c->nr_payload_regs++;
3185 }
3186
3187 /* R32-: bary for 32-pixel. */
3188 /* R58-59: interp W for 32-pixel. */
3189
3190 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3191 c->source_depth_to_render_target = true;
3192 }
3193 }
3194
3195 void
3196 fs_visitor::assign_binding_table_offsets()
3197 {
3198 uint32_t next_binding_table_offset = 0;
3199
3200 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3201 next_binding_table_offset += c->key.nr_color_regions;
3202
3203 assign_common_binding_table_offsets(next_binding_table_offset);
3204 }
3205
3206 bool
3207 fs_visitor::run()
3208 {
3209 sanity_param_count = fp->Base.Parameters->NumParameters;
3210 uint32_t orig_nr_params = c->prog_data.nr_params;
3211
3212 assign_binding_table_offsets();
3213
3214 if (brw->gen >= 6)
3215 setup_payload_gen6();
3216 else
3217 setup_payload_gen4();
3218
3219 if (0) {
3220 emit_dummy_fs();
3221 } else {
3222 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3223 emit_shader_time_begin();
3224
3225 calculate_urb_setup();
3226 if (fp->Base.InputsRead > 0) {
3227 if (brw->gen < 6)
3228 emit_interpolation_setup_gen4();
3229 else
3230 emit_interpolation_setup_gen6();
3231 }
3232
3233 /* We handle discards by keeping track of the still-live pixels in f0.1.
3234 * Initialize it with the dispatched pixels.
3235 */
3236 if (fp->UsesKill) {
3237 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3238 discard_init->flag_subreg = 1;
3239 }
3240
3241 /* Generate FS IR for main(). (the visitor only descends into
3242 * functions called "main").
3243 */
3244 if (shader) {
3245 foreach_list(node, &*shader->ir) {
3246 ir_instruction *ir = (ir_instruction *)node;
3247 base_ir = ir;
3248 this->result = reg_undef;
3249 ir->accept(this);
3250 }
3251 } else {
3252 emit_fragment_program_code();
3253 }
3254 base_ir = NULL;
3255 if (failed)
3256 return false;
3257
3258 emit(FS_OPCODE_PLACEHOLDER_HALT);
3259
3260 emit_fb_writes();
3261
3262 split_virtual_grfs();
3263
3264 move_uniform_array_access_to_pull_constants();
3265 remove_dead_constants();
3266 setup_pull_constants();
3267
3268 bool progress;
3269 do {
3270 progress = false;
3271
3272 compact_virtual_grfs();
3273
3274 progress = remove_duplicate_mrf_writes() || progress;
3275
3276 progress = opt_algebraic() || progress;
3277 progress = opt_cse() || progress;
3278 progress = opt_copy_propagate() || progress;
3279 progress = dead_code_eliminate() || progress;
3280 progress = dead_code_eliminate_local() || progress;
3281 progress = register_coalesce() || progress;
3282 progress = register_coalesce_2() || progress;
3283 progress = compute_to_mrf() || progress;
3284 } while (progress);
3285
3286 schedule_instructions(false);
3287
3288 lower_uniform_pull_constant_loads();
3289
3290 assign_curb_setup();
3291 assign_urb_setup();
3292
3293 if (0)
3294 assign_regs_trivial();
3295 else {
3296 while (!assign_regs()) {
3297 if (failed)
3298 break;
3299 }
3300 }
3301 }
3302 assert(force_uncompressed_stack == 0);
3303 assert(force_sechalf_stack == 0);
3304
3305 /* This must come after all optimization and register allocation, since
3306 * it inserts dead code that happens to have side effects, and it does
3307 * so based on the actual physical registers in use.
3308 */
3309 insert_gen4_send_dependency_workarounds();
3310
3311 if (failed)
3312 return false;
3313
3314 schedule_instructions(true);
3315
3316 if (dispatch_width == 8) {
3317 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3318 } else {
3319 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3320
3321 /* Make sure we didn't try to sneak in an extra uniform */
3322 assert(orig_nr_params == c->prog_data.nr_params);
3323 (void) orig_nr_params;
3324 }
3325
3326 /* If any state parameters were appended, then ParameterValues could have
3327 * been realloced, in which case the driver uniform storage set up by
3328 * _mesa_associate_uniform_storage() would point to freed memory. Make
3329 * sure that didn't happen.
3330 */
3331 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3332
3333 return !failed;
3334 }
3335
3336 const unsigned *
3337 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3338 struct gl_fragment_program *fp,
3339 struct gl_shader_program *prog,
3340 unsigned *final_assembly_size)
3341 {
3342 bool start_busy = false;
3343 float start_time = 0;
3344
3345 if (unlikely(brw->perf_debug)) {
3346 start_busy = (brw->batch.last_bo &&
3347 drm_intel_bo_busy(brw->batch.last_bo));
3348 start_time = get_time();
3349 }
3350
3351 struct brw_shader *shader = NULL;
3352 if (prog)
3353 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3354
3355 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3356 if (prog) {
3357 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3358 _mesa_print_ir(shader->ir, NULL);
3359 printf("\n\n");
3360 } else {
3361 printf("ARB_fragment_program %d ir for native fragment shader\n",
3362 fp->Base.Id);
3363 _mesa_print_program(&fp->Base);
3364 }
3365 }
3366
3367 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3368 */
3369 fs_visitor v(brw, c, prog, fp, 8);
3370 if (!v.run()) {
3371 if (prog) {
3372 prog->LinkStatus = false;
3373 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3374 }
3375
3376 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3377 v.fail_msg);
3378
3379 return NULL;
3380 }
3381
3382 exec_list *simd16_instructions = NULL;
3383 fs_visitor v2(brw, c, prog, fp, 16);
3384 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3385 if (c->prog_data.nr_pull_params == 0) {
3386 /* Try a 16-wide compile */
3387 v2.import_uniforms(&v);
3388 if (!v2.run()) {
3389 perf_debug("16-wide shader failed to compile, falling back to "
3390 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3391 } else {
3392 simd16_instructions = &v2.instructions;
3393 }
3394 } else {
3395 perf_debug("Skipping 16-wide due to pull parameters.\n");
3396 }
3397 }
3398
3399 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3400 const unsigned *generated = g.generate_assembly(&v.instructions,
3401 simd16_instructions,
3402 final_assembly_size);
3403
3404 if (unlikely(brw->perf_debug) && shader) {
3405 if (shader->compiled_once)
3406 brw_wm_debug_recompile(brw, prog, &c->key);
3407 shader->compiled_once = true;
3408
3409 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3410 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3411 (get_time() - start_time) * 1000);
3412 }
3413 }
3414
3415 return generated;
3416 }
3417
3418 bool
3419 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3420 {
3421 struct brw_context *brw = brw_context(ctx);
3422 struct brw_wm_prog_key key;
3423
3424 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3425 return true;
3426
3427 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3428 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3429 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3430 bool program_uses_dfdy = fp->UsesDFdy;
3431
3432 memset(&key, 0, sizeof(key));
3433
3434 if (brw->gen < 6) {
3435 if (fp->UsesKill)
3436 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3437
3438 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3439 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3440
3441 /* Just assume depth testing. */
3442 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3443 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3444 }
3445
3446 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3447 BRW_FS_VARYING_INPUT_MASK) > 16)
3448 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3449
3450 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3451
3452 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3453 for (unsigned i = 0; i < sampler_count; i++) {
3454 if (fp->Base.ShadowSamplers & (1 << i)) {
3455 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3456 key.tex.swizzles[i] =
3457 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3458 } else {
3459 /* Color sampler: assume no swizzling. */
3460 key.tex.swizzles[i] = SWIZZLE_XYZW;
3461 }
3462 }
3463
3464 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3465 key.drawable_height = ctx->DrawBuffer->Height;
3466 }
3467
3468 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3469 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3470 }
3471
3472 key.nr_color_regions = 1;
3473
3474 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3475 * quality of the derivatives is likely to be determined by the driconf
3476 * option.
3477 */
3478 key.high_quality_derivatives = brw->disable_derivative_optimization;
3479
3480 key.program_string_id = bfp->id;
3481
3482 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3483 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3484
3485 bool success = do_wm_prog(brw, prog, bfp, &key);
3486
3487 brw->wm.base.prog_offset = old_prog_offset;
3488 brw->wm.prog_data = old_prog_data;
3489
3490 return success;
3491 }