src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/hash_table.h"
  36 #include "main/macros.h"
  37 #include "main/shaderobj.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "brw_dead_control_flow.h"
  50 #include "main/uniforms.h"
  51 #include "brw_fs_live_variables.h"
  52 #include "glsl/glsl_types.h"
  53
  54 void
  55 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
  56 {
  57    memset(this, 0, sizeof(*this));
  58
  59    this->opcode = opcode;
  60    this->dst = dst;
  61    this->src = src;
  62    this->sources = sources;
  63
  64    this->conditional_mod = BRW_CONDITIONAL_NONE;
  65
  66    /* This will be the case for almost all instructions. */
  67    this->regs_written = 1;
  68
  69    this->writes_accumulator = false;
  70 }
  71
  72 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
  73 {
  74    fs_reg *src = ralloc_array(this, fs_reg, 3);
  75    init(opcode, dst, src, 0);
  76 }
  77
  78 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
  79 {
  80    fs_reg *src = ralloc_array(this, fs_reg, 3);
  81    src[0] = src0;
  82    init(opcode, dst, src, 1);
  83 }
  84
  85 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
  86                  const fs_reg &src1)
  87 {
  88    fs_reg *src = ralloc_array(this, fs_reg, 3);
  89    src[0] = src0;
  90    src[1] = src1;
  91    init(opcode, dst, src, 2);
  92 }
  93
  94 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
  95                  const fs_reg &src1, const fs_reg &src2)
  96 {
  97    fs_reg *src = ralloc_array(this, fs_reg, 3);
  98    src[0] = src0;
  99    src[1] = src1;
 100    src[2] = src2;
 101    init(opcode, dst, src, 3);
 102 }
 103
 104 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
 105 {
 106    init(opcode, dst, src, sources);
 107 }
 108
 109 fs_inst::fs_inst(const fs_inst &that)
 110 {
 111    memcpy(this, &that, sizeof(that));
 112
 113    this->src = ralloc_array(this, fs_reg, that.sources);
 114
 115    for (int i = 0; i < that.sources; i++)
 116       this->src[i] = that.src[i];
 117 }
 118
 119 void
 120 fs_inst::resize_sources(uint8_t num_sources)
 121 {
 122    if (this->sources != num_sources) {
 123       this->src = reralloc(this, this->src, fs_reg, num_sources);
 124       this->sources = num_sources;
 125    }
 126 }
 127
 128 #define ALU1(op)                                                        \
 129    fs_inst *                                                            \
 130    fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
 131    {                                                                    \
 132       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 133    }
 134
 135 #define ALU2(op)                                                        \
 136    fs_inst *                                                            \
 137    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 138                   const fs_reg &src1)                                   \
 139    {                                                                    \
 140       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 141    }
 142
 143 #define ALU2_ACC(op)                                                    \
 144    fs_inst *                                                            \
 145    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 146                   const fs_reg &src1)                                   \
 147    {                                                                    \
 148       fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
 149       inst->writes_accumulator = true;                                  \
 150       return inst;                                                      \
 151    }
 152
 153 #define ALU3(op)                                                        \
 154    fs_inst *                                                            \
 155    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 156                   const fs_reg &src1, const fs_reg &src2)               \
 157    {                                                                    \
 158       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 159    }
 160
 161 ALU1(NOT)
 162 ALU1(MOV)
 163 ALU1(FRC)
 164 ALU1(RNDD)
 165 ALU1(RNDE)
 166 ALU1(RNDZ)
 167 ALU2(ADD)
 168 ALU2(MUL)
 169 ALU2_ACC(MACH)
 170 ALU2(AND)
 171 ALU2(OR)
 172 ALU2(XOR)
 173 ALU2(SHL)
 174 ALU2(SHR)
 175 ALU2(ASR)
 176 ALU3(LRP)
 177 ALU1(BFREV)
 178 ALU3(BFE)
 179 ALU2(BFI1)
 180 ALU3(BFI2)
 181 ALU1(FBH)
 182 ALU1(FBL)
 183 ALU1(CBIT)
 184 ALU3(MAD)
 185 ALU2_ACC(ADDC)
 186 ALU2_ACC(SUBB)
 187 ALU2(SEL)
 188 ALU2(MAC)
 189
 190 /** Gen4 predicated IF. */
 191 fs_inst *
 192 fs_visitor::IF(uint32_t predicate)
 193 {
 194    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 195    inst->predicate = predicate;
 196    return inst;
 197 }
 198
 199 /** Gen6 IF with embedded comparison. */
 200 fs_inst *
 201 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1, uint32_t condition)
 202 {
 203    assert(brw->gen == 6);
 204    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 205                                         reg_null_d, src0, src1);
 206    inst->conditional_mod = condition;
 207    return inst;
 208 }
 209
 210 /**
 211  * CMP: Sets the low bit of the destination channels with the result
 212  * of the comparison, while the upper bits are undefined, and updates
 213  * the flag register with the packed 16 bits of the result.
 214  */
 215 fs_inst *
 216 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 217 {
 218    fs_inst *inst;
 219
 220    /* Take the instruction:
 221     *
 222     * CMP null<d> src0<f> src1<f>
 223     *
 224     * Original gen4 does type conversion to the destination type before
 225     * comparison, producing garbage results for floating point comparisons.
 226     * gen5 does the comparison on the execution type (resolved source types),
 227     * so dst type doesn't matter.  gen6 does comparison and then uses the
 228     * result as if it was the dst type with no conversion, which happens to
 229     * mostly work out for float-interpreted-as-int since our comparisons are
 230     * for >0, =0, <0.
 231     */
 232    if (brw->gen == 4) {
 233       dst.type = src0.type;
 234       if (dst.file == HW_REG)
 235          dst.fixed_hw_reg.type = dst.type;
 236    }
 237
 238    resolve_ud_negate(&src0);
 239    resolve_ud_negate(&src1);
 240
 241    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 242    inst->conditional_mod = condition;
 243
 244    return inst;
 245 }
 246
 247 fs_inst *
 248 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
 249 {
 250    fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
 251                                         sources);
 252    inst->regs_written = sources;
 253
 254    return inst;
 255 }
 256
 257 exec_list
 258 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
 259                                        const fs_reg &surf_index,
 260                                        const fs_reg &varying_offset,
 261                                        uint32_t const_offset)
 262 {
 263    exec_list instructions;
 264    fs_inst *inst;
 265
 266    /* We have our constant surface use a pitch of 4 bytes, so our index can
 267     * be any component of a vector, and then we load 4 contiguous
 268     * components starting from that.
 269     *
 270     * We break down the const_offset to a portion added to the variable
 271     * offset and a portion done using reg_offset, which means that if you
 272     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 273     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 274     * CSE can later notice that those loads are all the same and eliminate
 275     * the redundant ones.
 276     */
 277    fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
 278    instructions.push_tail(ADD(vec4_offset,
 279                               varying_offset, const_offset & ~3));
 280
 281    int scale = 1;
 282    if (brw->gen == 4 && dispatch_width == 8) {
 283       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 284        * u, v, r) as parameters, or we can just use the SIMD16 message
 285        * consisting of (header, u).  We choose the second, at the cost of a
 286        * longer return length.
 287        */
 288       scale = 2;
 289    }
 290
 291    enum opcode op;
 292    if (brw->gen >= 7)
 293       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 294    else
 295       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 296    fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
 297    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 298    inst->regs_written = 4 * scale;
 299    instructions.push_tail(inst);
 300
 301    if (brw->gen < 7) {
 302       inst->base_mrf = 13;
 303       inst->header_present = true;
 304       if (brw->gen == 4)
 305          inst->mlen = 3;
 306       else
 307          inst->mlen = 1 + dispatch_width / 8;
 308    }
 309
 310    vec4_result.reg_offset += (const_offset & 3) * scale;
 311    instructions.push_tail(MOV(dst, vec4_result));
 312
 313    return instructions;
 314 }
 315
 316 /**
 317  * A helper for MOV generation for fixing up broken hardware SEND dependency
 318  * handling.
 319  */
 320 fs_inst *
 321 fs_visitor::DEP_RESOLVE_MOV(int grf)
 322 {
 323    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 324
 325    inst->ir = NULL;
 326    inst->annotation = "send dependency resolve";
 327
 328    /* The caller always wants uncompressed to emit the minimal extra
 329     * dependencies, and to avoid having to deal with aligning its regs to 2.
 330     */
 331    inst->force_uncompressed = true;
 332
 333    return inst;
 334 }
 335
 336 bool
 337 fs_inst::equals(fs_inst *inst) const
 338 {
 339    return (opcode == inst->opcode &&
 340            dst.equals(inst->dst) &&
 341            src[0].equals(inst->src[0]) &&
 342            src[1].equals(inst->src[1]) &&
 343            src[2].equals(inst->src[2]) &&
 344            saturate == inst->saturate &&
 345            predicate == inst->predicate &&
 346            conditional_mod == inst->conditional_mod &&
 347            mlen == inst->mlen &&
 348            base_mrf == inst->base_mrf &&
 349            sampler == inst->sampler &&
 350            target == inst->target &&
 351            eot == inst->eot &&
 352            header_present == inst->header_present &&
 353            shadow_compare == inst->shadow_compare &&
 354            offset == inst->offset);
 355 }
 356
 357 bool
 358 fs_inst::overwrites_reg(const fs_reg &reg) const
 359 {
 360    return (reg.file == dst.file &&
 361            reg.reg == dst.reg &&
 362            reg.reg_offset >= dst.reg_offset  &&
 363            reg.reg_offset < dst.reg_offset + regs_written);
 364 }
 365
 366 bool
 367 fs_inst::is_send_from_grf() const
 368 {
 369    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 370            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
 371            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 372             src[1].file == GRF) ||
 373            (is_tex() && src[0].file == GRF));
 374 }
 375
 376 bool
 377 fs_inst::can_do_source_mods(struct brw_context *brw)
 378 {
 379    if (brw->gen == 6 && is_math())
 380       return false;
 381
 382    if (is_send_from_grf())
 383       return false;
 384
 385    if (!backend_instruction::can_do_source_mods())
 386       return false;
 387
 388    return true;
 389 }
 390
 391 void
 392 fs_reg::init()
 393 {
 394    memset(this, 0, sizeof(*this));
 395    stride = 1;
 396 }
 397
 398 /** Generic unset register constructor. */
 399 fs_reg::fs_reg()
 400 {
 401    init();
 402    this->file = BAD_FILE;
 403 }
 404
 405 /** Immediate value constructor. */
 406 fs_reg::fs_reg(float f)
 407 {
 408    init();
 409    this->file = IMM;
 410    this->type = BRW_REGISTER_TYPE_F;
 411    this->imm.f = f;
 412 }
 413
 414 /** Immediate value constructor. */
 415 fs_reg::fs_reg(int32_t i)
 416 {
 417    init();
 418    this->file = IMM;
 419    this->type = BRW_REGISTER_TYPE_D;
 420    this->imm.i = i;
 421 }
 422
 423 /** Immediate value constructor. */
 424 fs_reg::fs_reg(uint32_t u)
 425 {
 426    init();
 427    this->file = IMM;
 428    this->type = BRW_REGISTER_TYPE_UD;
 429    this->imm.u = u;
 430 }
 431
 432 /** Fixed brw_reg. */
 433 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 434 {
 435    init();
 436    this->file = HW_REG;
 437    this->fixed_hw_reg = fixed_hw_reg;
 438    this->type = fixed_hw_reg.type;
 439 }
 440
 441 bool
 442 fs_reg::equals(const fs_reg &r) const
 443 {
 444    return (file == r.file &&
 445            reg == r.reg &&
 446            reg_offset == r.reg_offset &&
 447            subreg_offset == r.subreg_offset &&
 448            type == r.type &&
 449            negate == r.negate &&
 450            abs == r.abs &&
 451            !reladdr && !r.reladdr &&
 452            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 453                   sizeof(fixed_hw_reg)) == 0 &&
 454            stride == r.stride &&
 455            imm.u == r.imm.u);
 456 }
 457
 458 fs_reg &
 459 fs_reg::apply_stride(unsigned stride)
 460 {
 461    assert((this->stride * stride) <= 4 &&
 462           (is_power_of_two(stride) || stride == 0) &&
 463           file != HW_REG && file != IMM);
 464    this->stride *= stride;
 465    return *this;
 466 }
 467
 468 fs_reg &
 469 fs_reg::set_smear(unsigned subreg)
 470 {
 471    assert(file != HW_REG && file != IMM);
 472    subreg_offset = subreg * type_sz(type);
 473    stride = 0;
 474    return *this;
 475 }
 476
 477 bool
 478 fs_reg::is_contiguous() const
 479 {
 480    return stride == 1;
 481 }
 482
 483 bool
 484 fs_reg::is_zero() const
 485 {
 486    if (file != IMM)
 487       return false;
 488
 489    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 490 }
 491
 492 bool
 493 fs_reg::is_one() const
 494 {
 495    if (file != IMM)
 496       return false;
 497
 498    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 499 }
 500
 501 bool
 502 fs_reg::is_null() const
 503 {
 504    return file == HW_REG &&
 505           fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 506           fixed_hw_reg.nr == BRW_ARF_NULL;
 507 }
 508
 509 bool
 510 fs_reg::is_valid_3src() const
 511 {
 512    return file == GRF || file == UNIFORM;
 513 }
 514
 515 bool
 516 fs_reg::is_accumulator() const
 517 {
 518    return file == HW_REG &&
 519           fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 520           fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR;
 521 }
 522
 523 int
 524 fs_visitor::type_size(const struct glsl_type *type)
 525 {
 526    unsigned int size, i;
 527
 528    switch (type->base_type) {
 529    case GLSL_TYPE_UINT:
 530    case GLSL_TYPE_INT:
 531    case GLSL_TYPE_FLOAT:
 532    case GLSL_TYPE_BOOL:
 533       return type->components();
 534    case GLSL_TYPE_ARRAY:
 535       return type_size(type->fields.array) * type->length;
 536    case GLSL_TYPE_STRUCT:
 537       size = 0;
 538       for (i = 0; i < type->length; i++) {
 539          size += type_size(type->fields.structure[i].type);
 540       }
 541       return size;
 542    case GLSL_TYPE_SAMPLER:
 543       /* Samplers take up no register space, since they're baked in at
 544        * link time.
 545        */
 546       return 0;
 547    case GLSL_TYPE_ATOMIC_UINT:
 548       return 0;
 549    case GLSL_TYPE_IMAGE:
 550    case GLSL_TYPE_VOID:
 551    case GLSL_TYPE_ERROR:
 552    case GLSL_TYPE_INTERFACE:
 553       unreachable("not reached");
 554    }
 555
 556    return 0;
 557 }
 558
 559 fs_reg
 560 fs_visitor::get_timestamp()
 561 {
 562    assert(brw->gen >= 7);
 563
 564    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 565                                           BRW_ARF_TIMESTAMP,
 566                                           0),
 567                              BRW_REGISTER_TYPE_UD));
 568
 569    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 570
 571    fs_inst *mov = emit(MOV(dst, ts));
 572    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 573     * even if it's not enabled in the dispatch.
 574     */
 575    mov->force_writemask_all = true;
 576    mov->force_uncompressed = true;
 577
 578    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 579     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 580     * which is plenty of time for our purposes.  It is identical across the
 581     * EUs, but since it's tracking GPU core speed it will increment at a
 582     * varying rate as render P-states change.
 583     *
 584     * The caller could also check if render P-states have changed (or anything
 585     * else that might disrupt timing) by setting smear to 2 and checking if
 586     * that field is != 0.
 587     */
 588    dst.set_smear(0);
 589
 590    return dst;
 591 }
 592
 593 void
 594 fs_visitor::emit_shader_time_begin()
 595 {
 596    current_annotation = "shader time start";
 597    shader_start_time = get_timestamp();
 598 }
 599
 600 void
 601 fs_visitor::emit_shader_time_end()
 602 {
 603    current_annotation = "shader time end";
 604
 605    enum shader_time_shader_type type, written_type, reset_type;
 606    if (dispatch_width == 8) {
 607       type = ST_FS8;
 608       written_type = ST_FS8_WRITTEN;
 609       reset_type = ST_FS8_RESET;
 610    } else {
 611       assert(dispatch_width == 16);
 612       type = ST_FS16;
 613       written_type = ST_FS16_WRITTEN;
 614       reset_type = ST_FS16_RESET;
 615    }
 616
 617    fs_reg shader_end_time = get_timestamp();
 618
 619    /* Check that there weren't any timestamp reset events (assuming these
 620     * were the only two timestamp reads that happened).
 621     */
 622    fs_reg reset = shader_end_time;
 623    reset.set_smear(2);
 624    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 625    test->conditional_mod = BRW_CONDITIONAL_Z;
 626    emit(IF(BRW_PREDICATE_NORMAL));
 627
 628    push_force_uncompressed();
 629    fs_reg start = shader_start_time;
 630    start.negate = true;
 631    fs_reg diff = fs_reg(this, glsl_type::uint_type);
 632    emit(ADD(diff, start, shader_end_time));
 633
 634    /* If there were no instructions between the two timestamp gets, the diff
 635     * is 2 cycles.  Remove that overhead, so I can forget about that when
 636     * trying to determine the time taken for single instructions.
 637     */
 638    emit(ADD(diff, diff, fs_reg(-2u)));
 639
 640    emit_shader_time_write(type, diff);
 641    emit_shader_time_write(written_type, fs_reg(1u));
 642    emit(BRW_OPCODE_ELSE);
 643    emit_shader_time_write(reset_type, fs_reg(1u));
 644    emit(BRW_OPCODE_ENDIF);
 645
 646    pop_force_uncompressed();
 647 }
 648
 649 void
 650 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 651                                    fs_reg value)
 652 {
 653    int shader_time_index =
 654       brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
 655    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 656
 657    fs_reg payload;
 658    if (dispatch_width == 8)
 659       payload = fs_reg(this, glsl_type::uvec2_type);
 660    else
 661       payload = fs_reg(this, glsl_type::uint_type);
 662
 663    emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 664                              fs_reg(), payload, offset, value));
 665 }
 666
 667 void
 668 fs_visitor::vfail(const char *format, va_list va)
 669 {
 670    char *msg;
 671
 672    if (failed)
 673       return;
 674
 675    failed = true;
 676
 677    msg = ralloc_vasprintf(mem_ctx, format, va);
 678    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 679
 680    this->fail_msg = msg;
 681
 682    if (INTEL_DEBUG & DEBUG_WM) {
 683       fprintf(stderr, "%s",  msg);
 684    }
 685 }
 686
 687 void
 688 fs_visitor::fail(const char *format, ...)
 689 {
 690    va_list va;
 691
 692    va_start(va, format);
 693    vfail(format, va);
 694    va_end(va);
 695 }
 696
 697 /**
 698  * Mark this program as impossible to compile in SIMD16 mode.
 699  *
 700  * During the SIMD8 compile (which happens first), we can detect and flag
 701  * things that are unsupported in SIMD16 mode, so the compiler can skip
 702  * the SIMD16 compile altogether.
 703  *
 704  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 705  */
 706 void
 707 fs_visitor::no16(const char *format, ...)
 708 {
 709    va_list va;
 710
 711    va_start(va, format);
 712
 713    if (dispatch_width == 16) {
 714       vfail(format, va);
 715    } else {
 716       simd16_unsupported = true;
 717
 718       if (brw->perf_debug) {
 719          if (no16_msg)
 720             ralloc_vasprintf_append(&no16_msg, format, va);
 721          else
 722             no16_msg = ralloc_vasprintf(mem_ctx, format, va);
 723       }
 724    }
 725
 726    va_end(va);
 727 }
 728
 729 fs_inst *
 730 fs_visitor::emit(enum opcode opcode)
 731 {
 732    return emit(new(mem_ctx) fs_inst(opcode));
 733 }
 734
 735 fs_inst *
 736 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
 737 {
 738    return emit(new(mem_ctx) fs_inst(opcode, dst));
 739 }
 740
 741 fs_inst *
 742 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 743 {
 744    return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
 745 }
 746
 747 fs_inst *
 748 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 749                  const fs_reg &src1)
 750 {
 751    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
 752 }
 753
 754 fs_inst *
 755 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 756                  const fs_reg &src1, const fs_reg &src2)
 757 {
 758    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
 759 }
 760
 761 fs_inst *
 762 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
 763                  fs_reg src[], int sources)
 764 {
 765    return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
 766 }
 767
 768 void
 769 fs_visitor::push_force_uncompressed()
 770 {
 771    force_uncompressed_stack++;
 772 }
 773
 774 void
 775 fs_visitor::pop_force_uncompressed()
 776 {
 777    force_uncompressed_stack--;
 778    assert(force_uncompressed_stack >= 0);
 779 }
 780
 781 /**
 782  * Returns true if the instruction has a flag that means it won't
 783  * update an entire destination register.
 784  *
 785  * For example, dead code elimination and live variable analysis want to know
 786  * when a write to a variable screens off any preceding values that were in
 787  * it.
 788  */
 789 bool
 790 fs_inst::is_partial_write() const
 791 {
 792    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 793            this->force_uncompressed ||
 794            this->force_sechalf || !this->dst.is_contiguous());
 795 }
 796
 797 int
 798 fs_inst::regs_read(fs_visitor *v, int arg) const
 799 {
 800    if (is_tex() && arg == 0 && src[0].file == GRF) {
 801       if (v->dispatch_width == 16)
 802          return (mlen + 1) / 2;
 803       else
 804          return mlen;
 805    }
 806    return 1;
 807 }
 808
 809 bool
 810 fs_inst::reads_flag() const
 811 {
 812    return predicate;
 813 }
 814
 815 bool
 816 fs_inst::writes_flag() const
 817 {
 818    return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
 819           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 820 }
 821
 822 /**
 823  * Returns how many MRFs an FS opcode will write over.
 824  *
 825  * Note that this is not the 0 or 1 implied writes in an actual gen
 826  * instruction -- the FS opcodes often generate MOVs in addition.
 827  */
 828 int
 829 fs_visitor::implied_mrf_writes(fs_inst *inst)
 830 {
 831    if (inst->mlen == 0)
 832       return 0;
 833
 834    if (inst->base_mrf == -1)
 835       return 0;
 836
 837    switch (inst->opcode) {
 838    case SHADER_OPCODE_RCP:
 839    case SHADER_OPCODE_RSQ:
 840    case SHADER_OPCODE_SQRT:
 841    case SHADER_OPCODE_EXP2:
 842    case SHADER_OPCODE_LOG2:
 843    case SHADER_OPCODE_SIN:
 844    case SHADER_OPCODE_COS:
 845       return 1 * dispatch_width / 8;
 846    case SHADER_OPCODE_POW:
 847    case SHADER_OPCODE_INT_QUOTIENT:
 848    case SHADER_OPCODE_INT_REMAINDER:
 849       return 2 * dispatch_width / 8;
 850    case SHADER_OPCODE_TEX:
 851    case FS_OPCODE_TXB:
 852    case SHADER_OPCODE_TXD:
 853    case SHADER_OPCODE_TXF:
 854    case SHADER_OPCODE_TXF_CMS:
 855    case SHADER_OPCODE_TXF_MCS:
 856    case SHADER_OPCODE_TG4:
 857    case SHADER_OPCODE_TG4_OFFSET:
 858    case SHADER_OPCODE_TXL:
 859    case SHADER_OPCODE_TXS:
 860    case SHADER_OPCODE_LOD:
 861       return 1;
 862    case FS_OPCODE_FB_WRITE:
 863       return 2;
 864    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 865    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 866       return 1;
 867    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 868       return inst->mlen;
 869    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 870       return 2;
 871    case SHADER_OPCODE_UNTYPED_ATOMIC:
 872    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 873       return 0;
 874    default:
 875       unreachable("not reached");
 876    }
 877 }
 878
 879 int
 880 fs_visitor::virtual_grf_alloc(int size)
 881 {
 882    if (virtual_grf_array_size <= virtual_grf_count) {
 883       if (virtual_grf_array_size == 0)
 884          virtual_grf_array_size = 16;
 885       else
 886          virtual_grf_array_size *= 2;
 887       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 888                                    virtual_grf_array_size);
 889    }
 890    virtual_grf_sizes[virtual_grf_count] = size;
 891    return virtual_grf_count++;
 892 }
 893
 894 /** Fixed HW reg constructor. */
 895 fs_reg::fs_reg(enum register_file file, int reg)
 896 {
 897    init();
 898    this->file = file;
 899    this->reg = reg;
 900    this->type = BRW_REGISTER_TYPE_F;
 901 }
 902
 903 /** Fixed HW reg constructor. */
 904 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 905 {
 906    init();
 907    this->file = file;
 908    this->reg = reg;
 909    this->type = type;
 910 }
 911
 912 /** Automatic reg constructor. */
 913 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 914 {
 915    init();
 916
 917    this->file = GRF;
 918    this->reg = v->virtual_grf_alloc(v->type_size(type));
 919    this->reg_offset = 0;
 920    this->type = brw_type_for_base_type(type);
 921 }
 922
 923 fs_reg *
 924 fs_visitor::variable_storage(ir_variable *var)
 925 {
 926    return (fs_reg *)hash_table_find(this->variable_ht, var);
 927 }
 928
 929 void
 930 import_uniforms_callback(const void *key,
 931                          void *data,
 932                          void *closure)
 933 {
 934    struct hash_table *dst_ht = (struct hash_table *)closure;
 935    const fs_reg *reg = (const fs_reg *)data;
 936
 937    if (reg->file != UNIFORM)
 938       return;
 939
 940    hash_table_insert(dst_ht, data, key);
 941 }
 942
 943 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 944  * This brings in those uniform definitions
 945  */
 946 void
 947 fs_visitor::import_uniforms(fs_visitor *v)
 948 {
 949    hash_table_call_foreach(v->variable_ht,
 950                            import_uniforms_callback,
 951                            variable_ht);
 952    this->push_constant_loc = v->push_constant_loc;
 953    this->pull_constant_loc = v->pull_constant_loc;
 954    this->uniforms = v->uniforms;
 955    this->param_size = v->param_size;
 956 }
 957
 958 /* Our support for uniforms is piggy-backed on the struct
 959  * gl_fragment_program, because that's where the values actually
 960  * get stored, rather than in some global gl_shader_program uniform
 961  * store.
 962  */
 963 void
 964 fs_visitor::setup_uniform_values(ir_variable *ir)
 965 {
 966    int namelen = strlen(ir->name);
 967
 968    /* The data for our (non-builtin) uniforms is stored in a series of
 969     * gl_uniform_driver_storage structs for each subcomponent that
 970     * glGetUniformLocation() could name.  We know it's been set up in the same
 971     * order we'd walk the type, so walk the list of storage and find anything
 972     * with our name, or the prefix of a component that starts with our name.
 973     */
 974    unsigned params_before = uniforms;
 975    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 976       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 977
 978       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 979           (storage->name[namelen] != 0 &&
 980            storage->name[namelen] != '.' &&
 981            storage->name[namelen] != '[')) {
 982          continue;
 983       }
 984
 985       unsigned slots = storage->type->component_slots();
 986       if (storage->array_elements)
 987          slots *= storage->array_elements;
 988
 989       for (unsigned i = 0; i < slots; i++) {
 990          stage_prog_data->param[uniforms++] = &storage->storage[i].f;
 991       }
 992    }
 993
 994    /* Make sure we actually initialized the right amount of stuff here. */
 995    assert(params_before + ir->type->component_slots() == uniforms);
 996    (void)params_before;
 997 }
 998
 999
1000 /* Our support for builtin uniforms is even scarier than non-builtin.
1001  * It sits on top of the PROG_STATE_VAR parameters that are
1002  * automatically updated from GL context state.
1003  */
1004 void
1005 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1006 {
1007    const ir_state_slot *const slots = ir->state_slots;
1008    assert(ir->state_slots != NULL);
1009
1010    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
1011       /* This state reference has already been setup by ir_to_mesa, but we'll
1012        * get the same index back here.
1013        */
1014       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
1015                                             (gl_state_index *)slots[i].tokens);
1016
1017       /* Add each of the unique swizzles of the element as a parameter.
1018        * This'll end up matching the expected layout of the
1019        * array/matrix/structure we're trying to fill in.
1020        */
1021       int last_swiz = -1;
1022       for (unsigned int j = 0; j < 4; j++) {
1023          int swiz = GET_SWZ(slots[i].swizzle, j);
1024          if (swiz == last_swiz)
1025             break;
1026          last_swiz = swiz;
1027
1028          stage_prog_data->param[uniforms++] =
1029             &fp->Base.Parameters->ParameterValues[index][swiz].f;
1030       }
1031    }
1032 }
1033
1034 fs_reg *
1035 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1036 {
1037    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1038    fs_reg wpos = *reg;
1039    bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1040
1041    /* gl_FragCoord.x */
1042    if (ir->data.pixel_center_integer) {
1043       emit(MOV(wpos, this->pixel_x));
1044    } else {
1045       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1046    }
1047    wpos.reg_offset++;
1048
1049    /* gl_FragCoord.y */
1050    if (!flip && ir->data.pixel_center_integer) {
1051       emit(MOV(wpos, this->pixel_y));
1052    } else {
1053       fs_reg pixel_y = this->pixel_y;
1054       float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1055
1056       if (flip) {
1057          pixel_y.negate = true;
1058          offset += key->drawable_height - 1.0;
1059       }
1060
1061       emit(ADD(wpos, pixel_y, fs_reg(offset)));
1062    }
1063    wpos.reg_offset++;
1064
1065    /* gl_FragCoord.z */
1066    if (brw->gen >= 6) {
1067       emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1068    } else {
1069       emit(FS_OPCODE_LINTERP, wpos,
1070            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1071            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1072            interp_reg(VARYING_SLOT_POS, 2));
1073    }
1074    wpos.reg_offset++;
1075
1076    /* gl_FragCoord.w: Already set up in emit_interpolation */
1077    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1078
1079    return reg;
1080 }
1081
1082 fs_inst *
1083 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1084                          glsl_interp_qualifier interpolation_mode,
1085                          bool is_centroid, bool is_sample)
1086 {
1087    brw_wm_barycentric_interp_mode barycoord_mode;
1088    if (brw->gen >= 6) {
1089       if (is_centroid) {
1090          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1091             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1092          else
1093             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1094       } else if (is_sample) {
1095           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1096             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1097          else
1098             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1099       } else {
1100          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1101             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1102          else
1103             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1104       }
1105    } else {
1106       /* On Ironlake and below, there is only one interpolation mode.
1107        * Centroid interpolation doesn't mean anything on this hardware --
1108        * there is no multisampling.
1109        */
1110       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1111    }
1112    return emit(FS_OPCODE_LINTERP, attr,
1113                this->delta_x[barycoord_mode],
1114                this->delta_y[barycoord_mode], interp);
1115 }
1116
1117 fs_reg *
1118 fs_visitor::emit_general_interpolation(ir_variable *ir)
1119 {
1120    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1121    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1122    fs_reg attr = *reg;
1123
1124    unsigned int array_elements;
1125    const glsl_type *type;
1126
1127    if (ir->type->is_array()) {
1128       array_elements = ir->type->length;
1129       if (array_elements == 0) {
1130          fail("dereferenced array '%s' has length 0\n", ir->name);
1131       }
1132       type = ir->type->fields.array;
1133    } else {
1134       array_elements = 1;
1135       type = ir->type;
1136    }
1137
1138    glsl_interp_qualifier interpolation_mode =
1139       ir->determine_interpolation_mode(key->flat_shade);
1140
1141    int location = ir->data.location;
1142    for (unsigned int i = 0; i < array_elements; i++) {
1143       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1144          if (prog_data->urb_setup[location] == -1) {
1145             /* If there's no incoming setup data for this slot, don't
1146              * emit interpolation for it.
1147              */
1148             attr.reg_offset += type->vector_elements;
1149             location++;
1150             continue;
1151          }
1152
1153          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1154             /* Constant interpolation (flat shading) case. The SF has
1155              * handed us defined values in only the constant offset
1156              * field of the setup reg.
1157              */
1158             for (unsigned int k = 0; k < type->vector_elements; k++) {
1159                struct brw_reg interp = interp_reg(location, k);
1160                interp = suboffset(interp, 3);
1161                interp.type = reg->type;
1162                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1163                attr.reg_offset++;
1164             }
1165          } else {
1166             /* Smooth/noperspective interpolation case. */
1167             for (unsigned int k = 0; k < type->vector_elements; k++) {
1168                struct brw_reg interp = interp_reg(location, k);
1169                if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1170                   /* Get the pixel/sample mask into f0 so that we know
1171                    * which pixels are lit.  Then, for each channel that is
1172                    * unlit, replace the centroid data with non-centroid
1173                    * data.
1174                    */
1175                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1176
1177                   fs_inst *inst;
1178                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1179                                       false, false);
1180                   inst->predicate = BRW_PREDICATE_NORMAL;
1181                   inst->predicate_inverse = true;
1182                   if (brw->has_pln)
1183                      inst->no_dd_clear = true;
1184
1185                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1186                                       ir->data.centroid && !key->persample_shading,
1187                                       ir->data.sample || key->persample_shading);
1188                   inst->predicate = BRW_PREDICATE_NORMAL;
1189                   inst->predicate_inverse = false;
1190                   if (brw->has_pln)
1191                      inst->no_dd_check = true;
1192
1193                } else {
1194                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1195                                ir->data.centroid && !key->persample_shading,
1196                                ir->data.sample || key->persample_shading);
1197                }
1198                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1199                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1200                }
1201                attr.reg_offset++;
1202             }
1203
1204          }
1205          location++;
1206       }
1207    }
1208
1209    return reg;
1210 }
1211
1212 fs_reg *
1213 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1214 {
1215    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1216
1217    /* The frontfacing comes in as a bit in the thread payload. */
1218    if (brw->gen >= 6) {
1219       emit(BRW_OPCODE_ASR, *reg,
1220            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1221            fs_reg(15));
1222       emit(BRW_OPCODE_NOT, *reg, *reg);
1223       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1224    } else {
1225       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1226       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1227        * us front face
1228        */
1229       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1230       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1231    }
1232
1233    return reg;
1234 }
1235
1236 void
1237 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1238 {
1239    assert(dst.type == BRW_REGISTER_TYPE_F);
1240
1241    if (key->compute_pos_offset) {
1242       /* Convert int_sample_pos to floating point */
1243       emit(MOV(dst, int_sample_pos));
1244       /* Scale to the range [0, 1] */
1245       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1246    }
1247    else {
1248       /* From ARB_sample_shading specification:
1249        * "When rendering to a non-multisample buffer, or if multisample
1250        *  rasterization is disabled, gl_SamplePosition will always be
1251        *  (0.5, 0.5).
1252        */
1253       emit(MOV(dst, fs_reg(0.5f)));
1254    }
1255 }
1256
1257 fs_reg *
1258 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1259 {
1260    assert(brw->gen >= 6);
1261    assert(ir->type == glsl_type::vec2_type);
1262
1263    this->current_annotation = "compute sample position";
1264    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1265    fs_reg pos = *reg;
1266    fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1267    fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1268
1269    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1270     * mode will be enabled.
1271     *
1272     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1273     * R31.1:0         Position Offset X/Y for Slot[3:0]
1274     * R31.3:2         Position Offset X/Y for Slot[7:4]
1275     * .....
1276     *
1277     * The X, Y sample positions come in as bytes in  thread payload. So, read
1278     * the positions using vstride=16, width=8, hstride=2.
1279     */
1280    struct brw_reg sample_pos_reg =
1281       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1282                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1283
1284    emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1285    if (dispatch_width == 16) {
1286       fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1287                                fs_reg(suboffset(sample_pos_reg, 16))));
1288       inst->force_sechalf = true;
1289    }
1290    /* Compute gl_SamplePosition.x */
1291    compute_sample_position(pos, int_sample_x);
1292    pos.reg_offset++;
1293    emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1294    if (dispatch_width == 16) {
1295       fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1296                                fs_reg(suboffset(sample_pos_reg, 17))));
1297       inst->force_sechalf = true;
1298    }
1299    /* Compute gl_SamplePosition.y */
1300    compute_sample_position(pos, int_sample_y);
1301    return reg;
1302 }
1303
1304 fs_reg *
1305 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1306 {
1307    assert(brw->gen >= 6);
1308
1309    this->current_annotation = "compute sample id";
1310    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1311
1312    if (key->compute_sample_id) {
1313       fs_reg t1 = fs_reg(this, glsl_type::int_type);
1314       fs_reg t2 = fs_reg(this, glsl_type::int_type);
1315       t2.type = BRW_REGISTER_TYPE_UW;
1316
1317       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1318        * 8x multisampling, subspan 0 will represent sample N (where N
1319        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1320        * 7. We can find the value of N by looking at R0.0 bits 7:6
1321        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1322        * (since samples are always delivered in pairs). That is, we
1323        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1324        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1325        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1326        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1327        * populating a temporary variable with the sequence (0, 1, 2, 3),
1328        * and then reading from it using vstride=1, width=4, hstride=0.
1329        * These computations hold good for 4x multisampling as well.
1330        */
1331       emit(BRW_OPCODE_AND, t1,
1332            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1333            fs_reg(0xc0));
1334       emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1335       /* This works for both SIMD8 and SIMD16 */
1336       emit(MOV(t2, brw_imm_v(0x3210)));
1337       /* This special instruction takes care of setting vstride=1,
1338        * width=4, hstride=0 of t2 during an ADD instruction.
1339        */
1340       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1341    } else {
1342       /* As per GL_ARB_sample_shading specification:
1343        * "When rendering to a non-multisample buffer, or if multisample
1344        *  rasterization is disabled, gl_SampleID will always be zero."
1345        */
1346       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1347    }
1348
1349    return reg;
1350 }
1351
1352 fs_reg
1353 fs_visitor::fix_math_operand(fs_reg src)
1354 {
1355    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1356     * might be able to do better by doing execsize = 1 math and then
1357     * expanding that result out, but we would need to be careful with
1358     * masking.
1359     *
1360     * The hardware ignores source modifiers (negate and abs) on math
1361     * instructions, so we also move to a temp to set those up.
1362     */
1363    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1364        !src.abs && !src.negate)
1365       return src;
1366
1367    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1368     * operands to math
1369     */
1370    if (brw->gen >= 7 && src.file != IMM)
1371       return src;
1372
1373    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1374    expanded.type = src.type;
1375    emit(BRW_OPCODE_MOV, expanded, src);
1376    return expanded;
1377 }
1378
1379 fs_inst *
1380 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1381 {
1382    switch (opcode) {
1383    case SHADER_OPCODE_RCP:
1384    case SHADER_OPCODE_RSQ:
1385    case SHADER_OPCODE_SQRT:
1386    case SHADER_OPCODE_EXP2:
1387    case SHADER_OPCODE_LOG2:
1388    case SHADER_OPCODE_SIN:
1389    case SHADER_OPCODE_COS:
1390       break;
1391    default:
1392       unreachable("not reached: bad math opcode");
1393    }
1394
1395    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1396     * might be able to do better by doing execsize = 1 math and then
1397     * expanding that result out, but we would need to be careful with
1398     * masking.
1399     *
1400     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1401     * instructions, so we also move to a temp to set those up.
1402     */
1403    if (brw->gen == 6 || brw->gen == 7)
1404       src = fix_math_operand(src);
1405
1406    fs_inst *inst = emit(opcode, dst, src);
1407
1408    if (brw->gen < 6) {
1409       inst->base_mrf = 2;
1410       inst->mlen = dispatch_width / 8;
1411    }
1412
1413    return inst;
1414 }
1415
1416 fs_inst *
1417 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1418 {
1419    int base_mrf = 2;
1420    fs_inst *inst;
1421
1422    switch (opcode) {
1423    case SHADER_OPCODE_INT_QUOTIENT:
1424    case SHADER_OPCODE_INT_REMAINDER:
1425       if (brw->gen >= 7)
1426          no16("SIMD16 INTDIV unsupported\n");
1427       break;
1428    case SHADER_OPCODE_POW:
1429       break;
1430    default:
1431       unreachable("not reached: unsupported binary math opcode.");
1432    }
1433
1434    if (brw->gen >= 8) {
1435       inst = emit(opcode, dst, src0, src1);
1436    } else if (brw->gen >= 6) {
1437       src0 = fix_math_operand(src0);
1438       src1 = fix_math_operand(src1);
1439
1440       inst = emit(opcode, dst, src0, src1);
1441    } else {
1442       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1443        * "Message Payload":
1444        *
1445        * "Operand0[7].  For the INT DIV functions, this operand is the
1446        *  denominator."
1447        *  ...
1448        * "Operand1[7].  For the INT DIV functions, this operand is the
1449        *  numerator."
1450        */
1451       bool is_int_div = opcode != SHADER_OPCODE_POW;
1452       fs_reg &op0 = is_int_div ? src1 : src0;
1453       fs_reg &op1 = is_int_div ? src0 : src1;
1454
1455       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1456       inst = emit(opcode, dst, op0, reg_null_f);
1457
1458       inst->base_mrf = base_mrf;
1459       inst->mlen = 2 * dispatch_width / 8;
1460    }
1461    return inst;
1462 }
1463
1464 void
1465 fs_visitor::assign_curb_setup()
1466 {
1467    if (dispatch_width == 8) {
1468       prog_data->first_curbe_grf = payload.num_regs;
1469    } else {
1470       prog_data->first_curbe_grf_16 = payload.num_regs;
1471    }
1472
1473    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1474
1475    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1476    foreach_in_list(fs_inst, inst, &instructions) {
1477       for (unsigned int i = 0; i < inst->sources; i++) {
1478          if (inst->src[i].file == UNIFORM) {
1479             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1480             int constant_nr;
1481             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1482                constant_nr = push_constant_loc[uniform_nr];
1483             } else {
1484                /* Section 5.11 of the OpenGL 4.1 spec says:
1485                 * "Out-of-bounds reads return undefined values, which include
1486                 *  values from other variables of the active program or zero."
1487                 * Just return the first push constant.
1488                 */
1489                constant_nr = 0;
1490             }
1491
1492             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1493                                                   constant_nr / 8,
1494                                                   constant_nr % 8);
1495
1496             inst->src[i].file = HW_REG;
1497             inst->src[i].fixed_hw_reg = byte_offset(
1498                retype(brw_reg, inst->src[i].type),
1499                inst->src[i].subreg_offset);
1500          }
1501       }
1502    }
1503 }
1504
1505 void
1506 fs_visitor::calculate_urb_setup()
1507 {
1508    for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1509       prog_data->urb_setup[i] = -1;
1510    }
1511
1512    int urb_next = 0;
1513    /* Figure out where each of the incoming setup attributes lands. */
1514    if (brw->gen >= 6) {
1515       if (_mesa_bitcount_64(fp->Base.InputsRead &
1516                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1517          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1518           * first 16 varying inputs, so we can put them wherever we want.
1519           * Just put them in order.
1520           *
1521           * This is useful because it means that (a) inputs not used by the
1522           * fragment shader won't take up valuable register space, and (b) we
1523           * won't have to recompile the fragment shader if it gets paired with
1524           * a different vertex (or geometry) shader.
1525           */
1526          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1527             if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1528                 BITFIELD64_BIT(i)) {
1529                prog_data->urb_setup[i] = urb_next++;
1530             }
1531          }
1532       } else {
1533          /* We have enough input varyings that the SF/SBE pipeline stage can't
1534           * arbitrarily rearrange them to suit our whim; we have to put them
1535           * in an order that matches the output of the previous pipeline stage
1536           * (geometry or vertex shader).
1537           */
1538          struct brw_vue_map prev_stage_vue_map;
1539          brw_compute_vue_map(brw, &prev_stage_vue_map,
1540                              key->input_slots_valid);
1541          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1542          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1543          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1544               slot++) {
1545             int varying = prev_stage_vue_map.slot_to_varying[slot];
1546             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1547              * unused.
1548              */
1549             if (varying != BRW_VARYING_SLOT_COUNT &&
1550                 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1551                  BITFIELD64_BIT(varying))) {
1552                prog_data->urb_setup[varying] = slot - first_slot;
1553             }
1554          }
1555          urb_next = prev_stage_vue_map.num_slots - first_slot;
1556       }
1557    } else {
1558       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1559       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1560          /* Point size is packed into the header, not as a general attribute */
1561          if (i == VARYING_SLOT_PSIZ)
1562             continue;
1563
1564          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1565             /* The back color slot is skipped when the front color is
1566              * also written to.  In addition, some slots can be
1567              * written in the vertex shader and not read in the
1568              * fragment shader.  So the register number must always be
1569              * incremented, mapped or not.
1570              */
1571             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1572                prog_data->urb_setup[i] = urb_next;
1573             urb_next++;
1574          }
1575       }
1576
1577       /*
1578        * It's a FS only attribute, and we did interpolation for this attribute
1579        * in SF thread. So, count it here, too.
1580        *
1581        * See compile_sf_prog() for more info.
1582        */
1583       if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1584          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1585    }
1586
1587    prog_data->num_varying_inputs = urb_next;
1588 }
1589
1590 void
1591 fs_visitor::assign_urb_setup()
1592 {
1593    int urb_start = payload.num_regs + prog_data->curb_read_length;
1594
1595    /* Offset all the urb_setup[] index by the actual position of the
1596     * setup regs, now that the location of the constants has been chosen.
1597     */
1598    foreach_in_list(fs_inst, inst, &instructions) {
1599       if (inst->opcode == FS_OPCODE_LINTERP) {
1600          assert(inst->src[2].file == HW_REG);
1601          inst->src[2].fixed_hw_reg.nr += urb_start;
1602       }
1603
1604       if (inst->opcode == FS_OPCODE_CINTERP) {
1605          assert(inst->src[0].file == HW_REG);
1606          inst->src[0].fixed_hw_reg.nr += urb_start;
1607       }
1608    }
1609
1610    /* Each attribute is 4 setup channels, each of which is half a reg. */
1611    this->first_non_payload_grf =
1612       urb_start + prog_data->num_varying_inputs * 2;
1613 }
1614
1615 /**
1616  * Split large virtual GRFs into separate components if we can.
1617  *
1618  * This is mostly duplicated with what brw_fs_vector_splitting does,
1619  * but that's really conservative because it's afraid of doing
1620  * splitting that doesn't result in real progress after the rest of
1621  * the optimization phases, which would cause infinite looping in
1622  * optimization.  We can do it once here, safely.  This also has the
1623  * opportunity to split interpolated values, or maybe even uniforms,
1624  * which we don't have at the IR level.
1625  *
1626  * We want to split, because virtual GRFs are what we register
1627  * allocate and spill (due to contiguousness requirements for some
1628  * instructions), and they're what we naturally generate in the
1629  * codegen process, but most virtual GRFs don't actually need to be
1630  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1631  * live intervals and better dead code elimination and coalescing.
1632  */
1633 void
1634 fs_visitor::split_virtual_grfs()
1635 {
1636    int num_vars = this->virtual_grf_count;
1637    bool split_grf[num_vars];
1638    int new_virtual_grf[num_vars];
1639
1640    /* Try to split anything > 0 sized. */
1641    for (int i = 0; i < num_vars; i++) {
1642       if (this->virtual_grf_sizes[i] != 1)
1643          split_grf[i] = true;
1644       else
1645          split_grf[i] = false;
1646    }
1647
1648    if (brw->has_pln &&
1649        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1650       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1651        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1652        * Gen6, that was the only supported interpolation mode, and since Gen6,
1653        * delta_x and delta_y are in fixed hardware registers.
1654        */
1655       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1656          false;
1657    }
1658
1659    foreach_in_list(fs_inst, inst, &instructions) {
1660       /* If there's a SEND message that requires contiguous destination
1661        * registers, no splitting is allowed.
1662        */
1663       if (inst->regs_written > 1) {
1664          split_grf[inst->dst.reg] = false;
1665       }
1666
1667       /* If we're sending from a GRF, don't split it, on the assumption that
1668        * the send is reading the whole thing.
1669        */
1670       if (inst->is_send_from_grf()) {
1671          for (int i = 0; i < inst->sources; i++) {
1672             if (inst->src[i].file == GRF) {
1673                split_grf[inst->src[i].reg] = false;
1674             }
1675          }
1676       }
1677    }
1678
1679    /* Allocate new space for split regs.  Note that the virtual
1680     * numbers will be contiguous.
1681     */
1682    for (int i = 0; i < num_vars; i++) {
1683       if (split_grf[i]) {
1684          new_virtual_grf[i] = virtual_grf_alloc(1);
1685          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1686             int reg = virtual_grf_alloc(1);
1687             assert(reg == new_virtual_grf[i] + j - 1);
1688             (void) reg;
1689          }
1690          this->virtual_grf_sizes[i] = 1;
1691       }
1692    }
1693
1694    foreach_in_list(fs_inst, inst, &instructions) {
1695       if (inst->dst.file == GRF &&
1696           split_grf[inst->dst.reg] &&
1697           inst->dst.reg_offset != 0) {
1698          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1699                           inst->dst.reg_offset - 1);
1700          inst->dst.reg_offset = 0;
1701       }
1702       for (int i = 0; i < inst->sources; i++) {
1703          if (inst->src[i].file == GRF &&
1704              split_grf[inst->src[i].reg] &&
1705              inst->src[i].reg_offset != 0) {
1706             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1707                                 inst->src[i].reg_offset - 1);
1708             inst->src[i].reg_offset = 0;
1709          }
1710       }
1711    }
1712    invalidate_live_intervals();
1713 }
1714
1715 /**
1716  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1717  *
1718  * During code generation, we create tons of temporary variables, many of
1719  * which get immediately killed and are never used again.  Yet, in later
1720  * optimization and analysis passes, such as compute_live_intervals, we need
1721  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1722  * overhead.
1723  */
1724 void
1725 fs_visitor::compact_virtual_grfs()
1726 {
1727    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1728       return;
1729
1730    /* Mark which virtual GRFs are used, and count how many. */
1731    int remap_table[this->virtual_grf_count];
1732    memset(remap_table, -1, sizeof(remap_table));
1733
1734    foreach_in_list(const fs_inst, inst, &instructions) {
1735       if (inst->dst.file == GRF)
1736          remap_table[inst->dst.reg] = 0;
1737
1738       for (int i = 0; i < inst->sources; i++) {
1739          if (inst->src[i].file == GRF)
1740             remap_table[inst->src[i].reg] = 0;
1741       }
1742    }
1743
1744    /* Compact the GRF arrays. */
1745    int new_index = 0;
1746    for (int i = 0; i < this->virtual_grf_count; i++) {
1747       if (remap_table[i] != -1) {
1748          remap_table[i] = new_index;
1749          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1750          invalidate_live_intervals();
1751          ++new_index;
1752       }
1753    }
1754
1755    this->virtual_grf_count = new_index;
1756
1757    /* Patch all the instructions to use the newly renumbered registers */
1758    foreach_in_list(fs_inst, inst, &instructions) {
1759       if (inst->dst.file == GRF)
1760          inst->dst.reg = remap_table[inst->dst.reg];
1761
1762       for (int i = 0; i < inst->sources; i++) {
1763          if (inst->src[i].file == GRF)
1764             inst->src[i].reg = remap_table[inst->src[i].reg];
1765       }
1766    }
1767 }
1768
1769 /*
1770  * Implements array access of uniforms by inserting a
1771  * PULL_CONSTANT_LOAD instruction.
1772  *
1773  * Unlike temporary GRF array access (where we don't support it due to
1774  * the difficulty of doing relative addressing on instruction
1775  * destinations), we could potentially do array access of uniforms
1776  * that were loaded in GRF space as push constants.  In real-world
1777  * usage we've seen, though, the arrays being used are always larger
1778  * than we could load as push constants, so just always move all
1779  * uniform array access out to a pull constant buffer.
1780  */
1781 void
1782 fs_visitor::move_uniform_array_access_to_pull_constants()
1783 {
1784    if (dispatch_width != 8)
1785       return;
1786
1787    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1788
1789    for (unsigned int i = 0; i < uniforms; i++) {
1790       pull_constant_loc[i] = -1;
1791    }
1792
1793    /* Walk through and find array access of uniforms.  Put a copy of that
1794     * uniform in the pull constant buffer.
1795     *
1796     * Note that we don't move constant-indexed accesses to arrays.  No
1797     * testing has been done of the performance impact of this choice.
1798     */
1799    foreach_in_list_safe(fs_inst, inst, &instructions) {
1800       for (int i = 0 ; i < inst->sources; i++) {
1801          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1802             continue;
1803
1804          int uniform = inst->src[i].reg;
1805
1806          /* If this array isn't already present in the pull constant buffer,
1807           * add it.
1808           */
1809          if (pull_constant_loc[uniform] == -1) {
1810             const float **values = &stage_prog_data->param[uniform];
1811
1812             assert(param_size[uniform]);
1813
1814             for (int j = 0; j < param_size[uniform]; j++) {
1815                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1816
1817                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1818                   values[j];
1819             }
1820          }
1821       }
1822    }
1823 }
1824
1825 /**
1826  * Assign UNIFORM file registers to either push constants or pull constants.
1827  *
1828  * We allow a fragment shader to have more than the specified minimum
1829  * maximum number of fragment shader uniform components (64).  If
1830  * there are too many of these, they'd fill up all of register space.
1831  * So, this will push some of them out to the pull constant buffer and
1832  * update the program to load them.
1833  */
1834 void
1835 fs_visitor::assign_constant_locations()
1836 {
1837    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1838    if (dispatch_width != 8)
1839       return;
1840
1841    /* Find which UNIFORM registers are still in use. */
1842    bool is_live[uniforms];
1843    for (unsigned int i = 0; i < uniforms; i++) {
1844       is_live[i] = false;
1845    }
1846
1847    foreach_in_list(fs_inst, inst, &instructions) {
1848       for (int i = 0; i < inst->sources; i++) {
1849          if (inst->src[i].file != UNIFORM)
1850             continue;
1851
1852          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1853          if (constant_nr >= 0 && constant_nr < (int) uniforms)
1854             is_live[constant_nr] = true;
1855       }
1856    }
1857
1858    /* Only allow 16 registers (128 uniform components) as push constants.
1859     *
1860     * Just demote the end of the list.  We could probably do better
1861     * here, demoting things that are rarely used in the program first.
1862     */
1863    unsigned int max_push_components = 16 * 8;
1864    unsigned int num_push_constants = 0;
1865
1866    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1867
1868    for (unsigned int i = 0; i < uniforms; i++) {
1869       if (!is_live[i] || pull_constant_loc[i] != -1) {
1870          /* This UNIFORM register is either dead, or has already been demoted
1871           * to a pull const.  Mark it as no longer living in the param[] array.
1872           */
1873          push_constant_loc[i] = -1;
1874          continue;
1875       }
1876
1877       if (num_push_constants < max_push_components) {
1878          /* Retain as a push constant.  Record the location in the params[]
1879           * array.
1880           */
1881          push_constant_loc[i] = num_push_constants++;
1882       } else {
1883          /* Demote to a pull constant. */
1884          push_constant_loc[i] = -1;
1885
1886          int pull_index = stage_prog_data->nr_pull_params++;
1887          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1888          pull_constant_loc[i] = pull_index;
1889       }
1890    }
1891
1892    stage_prog_data->nr_params = num_push_constants;
1893
1894    /* Up until now, the param[] array has been indexed by reg + reg_offset
1895     * of UNIFORM registers.  Condense it to only contain the uniforms we
1896     * chose to upload as push constants.
1897     */
1898    for (unsigned int i = 0; i < uniforms; i++) {
1899       int remapped = push_constant_loc[i];
1900
1901       if (remapped == -1)
1902          continue;
1903
1904       assert(remapped <= (int)i);
1905       stage_prog_data->param[remapped] = stage_prog_data->param[i];
1906    }
1907 }
1908
1909 /**
1910  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1911  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1912  */
1913 void
1914 fs_visitor::demote_pull_constants()
1915 {
1916    foreach_in_list(fs_inst, inst, &instructions) {
1917       for (int i = 0; i < inst->sources; i++) {
1918          if (inst->src[i].file != UNIFORM)
1919             continue;
1920
1921          int pull_index = pull_constant_loc[inst->src[i].reg +
1922                                             inst->src[i].reg_offset];
1923          if (pull_index == -1)
1924             continue;
1925
1926          /* Set up the annotation tracking for new generated instructions. */
1927          base_ir = inst->ir;
1928          current_annotation = inst->annotation;
1929
1930          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1931          fs_reg dst = fs_reg(this, glsl_type::float_type);
1932
1933          /* Generate a pull load into dst. */
1934          if (inst->src[i].reladdr) {
1935             exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1936                                                         surf_index,
1937                                                         *inst->src[i].reladdr,
1938                                                         pull_index);
1939             inst->insert_before(&list);
1940             inst->src[i].reladdr = NULL;
1941          } else {
1942             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1943             fs_inst *pull =
1944                new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1945                                     dst, surf_index, offset);
1946             inst->insert_before(pull);
1947             inst->src[i].set_smear(pull_index & 3);
1948          }
1949
1950          /* Rewrite the instruction to use the temporary VGRF. */
1951          inst->src[i].file = GRF;
1952          inst->src[i].reg = dst.reg;
1953          inst->src[i].reg_offset = 0;
1954       }
1955    }
1956    invalidate_live_intervals();
1957 }
1958
1959 bool
1960 fs_visitor::opt_algebraic()
1961 {
1962    bool progress = false;
1963
1964    foreach_in_list(fs_inst, inst, &instructions) {
1965       switch (inst->opcode) {
1966       case BRW_OPCODE_MUL:
1967          if (inst->src[1].file != IMM)
1968             continue;
1969
1970          /* a * 1.0 = a */
1971          if (inst->src[1].is_one()) {
1972             inst->opcode = BRW_OPCODE_MOV;
1973             inst->src[1] = reg_undef;
1974             progress = true;
1975             break;
1976          }
1977
1978          /* a * 0.0 = 0.0 */
1979          if (inst->src[1].is_zero()) {
1980             inst->opcode = BRW_OPCODE_MOV;
1981             inst->src[0] = inst->src[1];
1982             inst->src[1] = reg_undef;
1983             progress = true;
1984             break;
1985          }
1986
1987          break;
1988       case BRW_OPCODE_ADD:
1989          if (inst->src[1].file != IMM)
1990             continue;
1991
1992          /* a + 0.0 = a */
1993          if (inst->src[1].is_zero()) {
1994             inst->opcode = BRW_OPCODE_MOV;
1995             inst->src[1] = reg_undef;
1996             progress = true;
1997             break;
1998          }
1999          break;
2000       case BRW_OPCODE_OR:
2001          if (inst->src[0].equals(inst->src[1])) {
2002             inst->opcode = BRW_OPCODE_MOV;
2003             inst->src[1] = reg_undef;
2004             progress = true;
2005             break;
2006          }
2007          break;
2008       case BRW_OPCODE_LRP:
2009          if (inst->src[1].equals(inst->src[2])) {
2010             inst->opcode = BRW_OPCODE_MOV;
2011             inst->src[0] = inst->src[1];
2012             inst->src[1] = reg_undef;
2013             inst->src[2] = reg_undef;
2014             progress = true;
2015             break;
2016          }
2017          break;
2018       case BRW_OPCODE_SEL:
2019          if (inst->src[0].equals(inst->src[1])) {
2020             inst->opcode = BRW_OPCODE_MOV;
2021             inst->src[1] = reg_undef;
2022             inst->predicate = BRW_PREDICATE_NONE;
2023             inst->predicate_inverse = false;
2024             progress = true;
2025          } else if (inst->saturate && inst->src[1].file == IMM) {
2026             switch (inst->conditional_mod) {
2027             case BRW_CONDITIONAL_LE:
2028             case BRW_CONDITIONAL_L:
2029                switch (inst->src[1].type) {
2030                case BRW_REGISTER_TYPE_F:
2031                   if (inst->src[1].imm.f >= 1.0f) {
2032                      inst->opcode = BRW_OPCODE_MOV;
2033                      inst->src[1] = reg_undef;
2034                      progress = true;
2035                   }
2036                   break;
2037                default:
2038                   break;
2039                }
2040                break;
2041             case BRW_CONDITIONAL_GE:
2042             case BRW_CONDITIONAL_G:
2043                switch (inst->src[1].type) {
2044                case BRW_REGISTER_TYPE_F:
2045                   if (inst->src[1].imm.f <= 0.0f) {
2046                      inst->opcode = BRW_OPCODE_MOV;
2047                      inst->src[1] = reg_undef;
2048                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2049                      progress = true;
2050                   }
2051                   break;
2052                default:
2053                   break;
2054                }
2055             default:
2056                break;
2057             }
2058          }
2059          break;
2060       default:
2061          break;
2062       }
2063    }
2064
2065    return progress;
2066 }
2067
2068 bool
2069 fs_visitor::compute_to_mrf()
2070 {
2071    bool progress = false;
2072    int next_ip = 0;
2073
2074    calculate_live_intervals();
2075
2076    foreach_in_list_safe(fs_inst, inst, &instructions) {
2077       int ip = next_ip;
2078       next_ip++;
2079
2080       if (inst->opcode != BRW_OPCODE_MOV ||
2081           inst->is_partial_write() ||
2082           inst->dst.file != MRF || inst->src[0].file != GRF ||
2083           inst->dst.type != inst->src[0].type ||
2084           inst->src[0].abs || inst->src[0].negate ||
2085           !inst->src[0].is_contiguous() ||
2086           inst->src[0].subreg_offset)
2087          continue;
2088
2089       /* Work out which hardware MRF registers are written by this
2090        * instruction.
2091        */
2092       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2093       int mrf_high;
2094       if (inst->dst.reg & BRW_MRF_COMPR4) {
2095          mrf_high = mrf_low + 4;
2096       } else if (dispatch_width == 16 &&
2097                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2098          mrf_high = mrf_low + 1;
2099       } else {
2100          mrf_high = mrf_low;
2101       }
2102
2103       /* Can't compute-to-MRF this GRF if someone else was going to
2104        * read it later.
2105        */
2106       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2107          continue;
2108
2109       /* Found a move of a GRF to a MRF.  Let's see if we can go
2110        * rewrite the thing that made this GRF to write into the MRF.
2111        */
2112       fs_inst *scan_inst;
2113       for (scan_inst = (fs_inst *)inst->prev;
2114            !scan_inst->is_head_sentinel();
2115            scan_inst = (fs_inst *)scan_inst->prev) {
2116          if (scan_inst->dst.file == GRF &&
2117              scan_inst->dst.reg == inst->src[0].reg) {
2118             /* Found the last thing to write our reg we want to turn
2119              * into a compute-to-MRF.
2120              */
2121
2122             /* If this one instruction didn't populate all the
2123              * channels, bail.  We might be able to rewrite everything
2124              * that writes that reg, but it would require smarter
2125              * tracking to delay the rewriting until complete success.
2126              */
2127             if (scan_inst->is_partial_write())
2128                break;
2129
2130             /* Things returning more than one register would need us to
2131              * understand coalescing out more than one MOV at a time.
2132              */
2133             if (scan_inst->regs_written > 1)
2134                break;
2135
2136             /* SEND instructions can't have MRF as a destination. */
2137             if (scan_inst->mlen)
2138                break;
2139
2140             if (brw->gen == 6) {
2141                /* gen6 math instructions must have the destination be
2142                 * GRF, so no compute-to-MRF for them.
2143                 */
2144                if (scan_inst->is_math()) {
2145                   break;
2146                }
2147             }
2148
2149             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2150                /* Found the creator of our MRF's source value. */
2151                scan_inst->dst.file = MRF;
2152                scan_inst->dst.reg = inst->dst.reg;
2153                scan_inst->saturate |= inst->saturate;
2154                inst->remove();
2155                progress = true;
2156             }
2157             break;
2158          }
2159
2160          /* We don't handle control flow here.  Most computation of
2161           * values that end up in MRFs are shortly before the MRF
2162           * write anyway.
2163           */
2164          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2165             break;
2166
2167          /* You can't read from an MRF, so if someone else reads our
2168           * MRF's source GRF that we wanted to rewrite, that stops us.
2169           */
2170          bool interfered = false;
2171          for (int i = 0; i < scan_inst->sources; i++) {
2172             if (scan_inst->src[i].file == GRF &&
2173                 scan_inst->src[i].reg == inst->src[0].reg &&
2174                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2175                interfered = true;
2176             }
2177          }
2178          if (interfered)
2179             break;
2180
2181          if (scan_inst->dst.file == MRF) {
2182             /* If somebody else writes our MRF here, we can't
2183              * compute-to-MRF before that.
2184              */
2185             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2186             int scan_mrf_high;
2187
2188             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2189                scan_mrf_high = scan_mrf_low + 4;
2190             } else if (dispatch_width == 16 &&
2191                        (!scan_inst->force_uncompressed &&
2192                         !scan_inst->force_sechalf)) {
2193                scan_mrf_high = scan_mrf_low + 1;
2194             } else {
2195                scan_mrf_high = scan_mrf_low;
2196             }
2197
2198             if (mrf_low == scan_mrf_low ||
2199                 mrf_low == scan_mrf_high ||
2200                 mrf_high == scan_mrf_low ||
2201                 mrf_high == scan_mrf_high) {
2202                break;
2203             }
2204          }
2205
2206          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2207             /* Found a SEND instruction, which means that there are
2208              * live values in MRFs from base_mrf to base_mrf +
2209              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2210              * above it.
2211              */
2212             if (mrf_low >= scan_inst->base_mrf &&
2213                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2214                break;
2215             }
2216             if (mrf_high >= scan_inst->base_mrf &&
2217                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2218                break;
2219             }
2220          }
2221       }
2222    }
2223
2224    if (progress)
2225       invalidate_live_intervals();
2226
2227    return progress;
2228 }
2229
2230 /**
2231  * Walks through basic blocks, looking for repeated MRF writes and
2232  * removing the later ones.
2233  */
2234 bool
2235 fs_visitor::remove_duplicate_mrf_writes()
2236 {
2237    fs_inst *last_mrf_move[16];
2238    bool progress = false;
2239
2240    /* Need to update the MRF tracking for compressed instructions. */
2241    if (dispatch_width == 16)
2242       return false;
2243
2244    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2245
2246    foreach_in_list_safe(fs_inst, inst, &instructions) {
2247       if (inst->is_control_flow()) {
2248          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2249       }
2250
2251       if (inst->opcode == BRW_OPCODE_MOV &&
2252           inst->dst.file == MRF) {
2253          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2254          if (prev_inst && inst->equals(prev_inst)) {
2255             inst->remove();
2256             progress = true;
2257             continue;
2258          }
2259       }
2260
2261       /* Clear out the last-write records for MRFs that were overwritten. */
2262       if (inst->dst.file == MRF) {
2263          last_mrf_move[inst->dst.reg] = NULL;
2264       }
2265
2266       if (inst->mlen > 0 && inst->base_mrf != -1) {
2267          /* Found a SEND instruction, which will include two or fewer
2268           * implied MRF writes.  We could do better here.
2269           */
2270          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2271             last_mrf_move[inst->base_mrf + i] = NULL;
2272          }
2273       }
2274
2275       /* Clear out any MRF move records whose sources got overwritten. */
2276       if (inst->dst.file == GRF) {
2277          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2278             if (last_mrf_move[i] &&
2279                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2280                last_mrf_move[i] = NULL;
2281             }
2282          }
2283       }
2284
2285       if (inst->opcode == BRW_OPCODE_MOV &&
2286           inst->dst.file == MRF &&
2287           inst->src[0].file == GRF &&
2288           !inst->is_partial_write()) {
2289          last_mrf_move[inst->dst.reg] = inst;
2290       }
2291    }
2292
2293    if (progress)
2294       invalidate_live_intervals();
2295
2296    return progress;
2297 }
2298
2299 static void
2300 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2301                         int first_grf, int grf_len)
2302 {
2303    bool inst_simd16 = (dispatch_width > 8 &&
2304                        !inst->force_uncompressed &&
2305                        !inst->force_sechalf);
2306
2307    /* Clear the flag for registers that actually got read (as expected). */
2308    for (int i = 0; i < inst->sources; i++) {
2309       int grf;
2310       if (inst->src[i].file == GRF) {
2311          grf = inst->src[i].reg;
2312       } else if (inst->src[i].file == HW_REG &&
2313                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2314          grf = inst->src[i].fixed_hw_reg.nr;
2315       } else {
2316          continue;
2317       }
2318
2319       if (grf >= first_grf &&
2320           grf < first_grf + grf_len) {
2321          deps[grf - first_grf] = false;
2322          if (inst_simd16)
2323             deps[grf - first_grf + 1] = false;
2324       }
2325    }
2326 }
2327
2328 /**
2329  * Implements this workaround for the original 965:
2330  *
2331  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2332  *      check for post destination dependencies on this instruction, software
2333  *      must ensure that there is no destination hazard for the case of ‘write
2334  *      followed by a posted write’ shown in the following example.
2335  *
2336  *      1. mov r3 0
2337  *      2. send r3.xy <rest of send instruction>
2338  *      3. mov r2 r3
2339  *
2340  *      Due to no post-destination dependency check on the ‘send’, the above
2341  *      code sequence could have two instructions (1 and 2) in flight at the
2342  *      same time that both consider ‘r3’ as the target of their final writes.
2343  */
2344 void
2345 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2346 {
2347    int reg_size = dispatch_width / 8;
2348    int write_len = inst->regs_written * reg_size;
2349    int first_write_grf = inst->dst.reg;
2350    bool needs_dep[BRW_MAX_MRF];
2351    assert(write_len < (int)sizeof(needs_dep) - 1);
2352
2353    memset(needs_dep, false, sizeof(needs_dep));
2354    memset(needs_dep, true, write_len);
2355
2356    clear_deps_for_inst_src(inst, dispatch_width,
2357                            needs_dep, first_write_grf, write_len);
2358
2359    /* Walk backwards looking for writes to registers we're writing which
2360     * aren't read since being written.  If we hit the start of the program,
2361     * we assume that there are no outstanding dependencies on entry to the
2362     * program.
2363     */
2364    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2365         !scan_inst->is_head_sentinel();
2366         scan_inst = (fs_inst *)scan_inst->prev) {
2367
2368       /* If we hit control flow, assume that there *are* outstanding
2369        * dependencies, and force their cleanup before our instruction.
2370        */
2371       if (scan_inst->is_control_flow()) {
2372          for (int i = 0; i < write_len; i++) {
2373             if (needs_dep[i]) {
2374                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2375             }
2376          }
2377          return;
2378       }
2379
2380       bool scan_inst_simd16 = (dispatch_width > 8 &&
2381                                !scan_inst->force_uncompressed &&
2382                                !scan_inst->force_sechalf);
2383
2384       /* We insert our reads as late as possible on the assumption that any
2385        * instruction but a MOV that might have left us an outstanding
2386        * dependency has more latency than a MOV.
2387        */
2388       if (scan_inst->dst.file == GRF) {
2389          for (int i = 0; i < scan_inst->regs_written; i++) {
2390             int reg = scan_inst->dst.reg + i * reg_size;
2391
2392             if (reg >= first_write_grf &&
2393                 reg < first_write_grf + write_len &&
2394                 needs_dep[reg - first_write_grf]) {
2395                inst->insert_before(DEP_RESOLVE_MOV(reg));
2396                needs_dep[reg - first_write_grf] = false;
2397                if (scan_inst_simd16)
2398                   needs_dep[reg - first_write_grf + 1] = false;
2399             }
2400          }
2401       }
2402
2403       /* Clear the flag for registers that actually got read (as expected). */
2404       clear_deps_for_inst_src(scan_inst, dispatch_width,
2405                               needs_dep, first_write_grf, write_len);
2406
2407       /* Continue the loop only if we haven't resolved all the dependencies */
2408       int i;
2409       for (i = 0; i < write_len; i++) {
2410          if (needs_dep[i])
2411             break;
2412       }
2413       if (i == write_len)
2414          return;
2415    }
2416 }
2417
2418 /**
2419  * Implements this workaround for the original 965:
2420  *
2421  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2422  *      used as a destination register until after it has been sourced by an
2423  *      instruction with a different destination register.
2424  */
2425 void
2426 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2427 {
2428    int write_len = inst->regs_written * dispatch_width / 8;
2429    int first_write_grf = inst->dst.reg;
2430    bool needs_dep[BRW_MAX_MRF];
2431    assert(write_len < (int)sizeof(needs_dep) - 1);
2432
2433    memset(needs_dep, false, sizeof(needs_dep));
2434    memset(needs_dep, true, write_len);
2435    /* Walk forwards looking for writes to registers we're writing which aren't
2436     * read before being written.
2437     */
2438    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2439         !scan_inst->is_tail_sentinel();
2440         scan_inst = (fs_inst *)scan_inst->next) {
2441       /* If we hit control flow, force resolve all remaining dependencies. */
2442       if (scan_inst->is_control_flow()) {
2443          for (int i = 0; i < write_len; i++) {
2444             if (needs_dep[i])
2445                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2446          }
2447          return;
2448       }
2449
2450       /* Clear the flag for registers that actually got read (as expected). */
2451       clear_deps_for_inst_src(scan_inst, dispatch_width,
2452                               needs_dep, first_write_grf, write_len);
2453
2454       /* We insert our reads as late as possible since they're reading the
2455        * result of a SEND, which has massive latency.
2456        */
2457       if (scan_inst->dst.file == GRF &&
2458           scan_inst->dst.reg >= first_write_grf &&
2459           scan_inst->dst.reg < first_write_grf + write_len &&
2460           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2461          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2462          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2463       }
2464
2465       /* Continue the loop only if we haven't resolved all the dependencies */
2466       int i;
2467       for (i = 0; i < write_len; i++) {
2468          if (needs_dep[i])
2469             break;
2470       }
2471       if (i == write_len)
2472          return;
2473    }
2474
2475    /* If we hit the end of the program, resolve all remaining dependencies out
2476     * of paranoia.
2477     */
2478    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2479    assert(last_inst->eot);
2480    for (int i = 0; i < write_len; i++) {
2481       if (needs_dep[i])
2482          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2483    }
2484 }
2485
2486 void
2487 fs_visitor::insert_gen4_send_dependency_workarounds()
2488 {
2489    if (brw->gen != 4 || brw->is_g4x)
2490       return;
2491
2492    bool progress = false;
2493
2494    /* Note that we're done with register allocation, so GRF fs_regs always
2495     * have a .reg_offset of 0.
2496     */
2497
2498    foreach_in_list_safe(fs_inst, inst, &instructions) {
2499       if (inst->mlen != 0 && inst->dst.file == GRF) {
2500          insert_gen4_pre_send_dependency_workarounds(inst);
2501          insert_gen4_post_send_dependency_workarounds(inst);
2502          progress = true;
2503       }
2504    }
2505
2506    if (progress)
2507       invalidate_live_intervals();
2508 }
2509
2510 /**
2511  * Turns the generic expression-style uniform pull constant load instruction
2512  * into a hardware-specific series of instructions for loading a pull
2513  * constant.
2514  *
2515  * The expression style allows the CSE pass before this to optimize out
2516  * repeated loads from the same offset, and gives the pre-register-allocation
2517  * scheduling full flexibility, while the conversion to native instructions
2518  * allows the post-register-allocation scheduler the best information
2519  * possible.
2520  *
2521  * Note that execution masking for setting up pull constant loads is special:
2522  * the channels that need to be written are unrelated to the current execution
2523  * mask, since a later instruction will use one of the result channels as a
2524  * source operand for all 8 or 16 of its channels.
2525  */
2526 void
2527 fs_visitor::lower_uniform_pull_constant_loads()
2528 {
2529    foreach_in_list(fs_inst, inst, &instructions) {
2530       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2531          continue;
2532
2533       if (brw->gen >= 7) {
2534          /* The offset arg before was a vec4-aligned byte offset.  We need to
2535           * turn it into a dword offset.
2536           */
2537          fs_reg const_offset_reg = inst->src[1];
2538          assert(const_offset_reg.file == IMM &&
2539                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2540          const_offset_reg.imm.u /= 4;
2541          fs_reg payload = fs_reg(this, glsl_type::uint_type);
2542
2543          /* This is actually going to be a MOV, but since only the first dword
2544           * is accessed, we have a special opcode to do just that one.  Note
2545           * that this needs to be an operation that will be considered a def
2546           * by live variable analysis, or register allocation will explode.
2547           */
2548          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2549                                                payload, const_offset_reg);
2550          setup->force_writemask_all = true;
2551
2552          setup->ir = inst->ir;
2553          setup->annotation = inst->annotation;
2554          inst->insert_before(setup);
2555
2556          /* Similarly, this will only populate the first 4 channels of the
2557           * result register (since we only use smear values from 0-3), but we
2558           * don't tell the optimizer.
2559           */
2560          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2561          inst->src[1] = payload;
2562
2563          invalidate_live_intervals();
2564       } else {
2565          /* Before register allocation, we didn't tell the scheduler about the
2566           * MRF we use.  We know it's safe to use this MRF because nothing
2567           * else does except for register spill/unspill, which generates and
2568           * uses its MRF within a single IR instruction.
2569           */
2570          inst->base_mrf = 14;
2571          inst->mlen = 1;
2572       }
2573    }
2574 }
2575
2576 bool
2577 fs_visitor::lower_load_payload()
2578 {
2579    bool progress = false;
2580
2581    foreach_in_list_safe(fs_inst, inst, &instructions) {
2582       if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2583          fs_reg dst = inst->dst;
2584
2585          /* src[0] represents the (optional) message header. */
2586          if (inst->src[0].file != BAD_FILE) {
2587             inst->insert_before(MOV(dst, inst->src[0]));
2588          }
2589          dst.reg_offset++;
2590
2591          for (int i = 1; i < inst->sources; i++) {
2592             inst->insert_before(MOV(dst, inst->src[i]));
2593             dst.reg_offset++;
2594          }
2595
2596          inst->remove();
2597          progress = true;
2598       }
2599    }
2600
2601    if (progress)
2602       invalidate_live_intervals();
2603
2604    return progress;
2605 }
2606
2607 void
2608 fs_visitor::dump_instructions()
2609 {
2610    dump_instructions(NULL);
2611 }
2612
2613 void
2614 fs_visitor::dump_instructions(const char *name)
2615 {
2616    calculate_register_pressure();
2617    FILE *file = stderr;
2618    if (name && geteuid() != 0) {
2619       file = fopen(name, "w");
2620       if (!file)
2621          file = stderr;
2622    }
2623
2624    int ip = 0, max_pressure = 0;
2625    foreach_in_list(backend_instruction, inst, &instructions) {
2626       max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2627       fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2628       dump_instruction(inst, file);
2629       ++ip;
2630    }
2631    fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2632
2633    if (file != stderr) {
2634       fclose(file);
2635    }
2636 }
2637
2638 void
2639 fs_visitor::dump_instruction(backend_instruction *be_inst)
2640 {
2641    dump_instruction(be_inst, stderr);
2642 }
2643
2644 void
2645 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2646 {
2647    fs_inst *inst = (fs_inst *)be_inst;
2648
2649    if (inst->predicate) {
2650       fprintf(file, "(%cf0.%d) ",
2651              inst->predicate_inverse ? '-' : '+',
2652              inst->flag_subreg);
2653    }
2654
2655    fprintf(file, "%s", brw_instruction_name(inst->opcode));
2656    if (inst->saturate)
2657       fprintf(file, ".sat");
2658    if (inst->conditional_mod) {
2659       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2660       if (!inst->predicate &&
2661           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2662                               inst->opcode != BRW_OPCODE_IF &&
2663                               inst->opcode != BRW_OPCODE_WHILE))) {
2664          fprintf(file, ".f0.%d", inst->flag_subreg);
2665       }
2666    }
2667    fprintf(file, " ");
2668
2669
2670    switch (inst->dst.file) {
2671    case GRF:
2672       fprintf(file, "vgrf%d", inst->dst.reg);
2673       if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2674           inst->dst.subreg_offset)
2675          fprintf(file, "+%d.%d",
2676                  inst->dst.reg_offset, inst->dst.subreg_offset);
2677       break;
2678    case MRF:
2679       fprintf(file, "m%d", inst->dst.reg);
2680       break;
2681    case BAD_FILE:
2682       fprintf(file, "(null)");
2683       break;
2684    case UNIFORM:
2685       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2686       break;
2687    case HW_REG:
2688       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2689          switch (inst->dst.fixed_hw_reg.nr) {
2690          case BRW_ARF_NULL:
2691             fprintf(file, "null");
2692             break;
2693          case BRW_ARF_ADDRESS:
2694             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2695             break;
2696          case BRW_ARF_ACCUMULATOR:
2697             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2698             break;
2699          case BRW_ARF_FLAG:
2700             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2701                              inst->dst.fixed_hw_reg.subnr);
2702             break;
2703          default:
2704             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2705                                inst->dst.fixed_hw_reg.subnr);
2706             break;
2707          }
2708       } else {
2709          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2710       }
2711       if (inst->dst.fixed_hw_reg.subnr)
2712          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2713       break;
2714    default:
2715       fprintf(file, "???");
2716       break;
2717    }
2718    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2719
2720    for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2721       if (inst->src[i].negate)
2722          fprintf(file, "-");
2723       if (inst->src[i].abs)
2724          fprintf(file, "|");
2725       switch (inst->src[i].file) {
2726       case GRF:
2727          fprintf(file, "vgrf%d", inst->src[i].reg);
2728          if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2729              inst->src[i].subreg_offset)
2730             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2731                     inst->src[i].subreg_offset);
2732          break;
2733       case MRF:
2734          fprintf(file, "***m%d***", inst->src[i].reg);
2735          break;
2736       case UNIFORM:
2737          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2738          if (inst->src[i].reladdr) {
2739             fprintf(file, "+reladdr");
2740          } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2741              inst->src[i].subreg_offset) {
2742             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2743                     inst->src[i].subreg_offset);
2744          }
2745          break;
2746       case BAD_FILE:
2747          fprintf(file, "(null)");
2748          break;
2749       case IMM:
2750          switch (inst->src[i].type) {
2751          case BRW_REGISTER_TYPE_F:
2752             fprintf(file, "%ff", inst->src[i].imm.f);
2753             break;
2754          case BRW_REGISTER_TYPE_D:
2755             fprintf(file, "%dd", inst->src[i].imm.i);
2756             break;
2757          case BRW_REGISTER_TYPE_UD:
2758             fprintf(file, "%uu", inst->src[i].imm.u);
2759             break;
2760          default:
2761             fprintf(file, "???");
2762             break;
2763          }
2764          break;
2765       case HW_REG:
2766          if (inst->src[i].fixed_hw_reg.negate)
2767             fprintf(file, "-");
2768          if (inst->src[i].fixed_hw_reg.abs)
2769             fprintf(file, "|");
2770          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2771             switch (inst->src[i].fixed_hw_reg.nr) {
2772             case BRW_ARF_NULL:
2773                fprintf(file, "null");
2774                break;
2775             case BRW_ARF_ADDRESS:
2776                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2777                break;
2778             case BRW_ARF_ACCUMULATOR:
2779                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2780                break;
2781             case BRW_ARF_FLAG:
2782                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2783                                 inst->src[i].fixed_hw_reg.subnr);
2784                break;
2785             default:
2786                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2787                                   inst->src[i].fixed_hw_reg.subnr);
2788                break;
2789             }
2790          } else {
2791             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2792          }
2793          if (inst->src[i].fixed_hw_reg.subnr)
2794             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2795          if (inst->src[i].fixed_hw_reg.abs)
2796             fprintf(file, "|");
2797          break;
2798       default:
2799          fprintf(file, "???");
2800          break;
2801       }
2802       if (inst->src[i].abs)
2803          fprintf(file, "|");
2804
2805       if (inst->src[i].file != IMM) {
2806          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2807       }
2808
2809       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2810          fprintf(file, ", ");
2811    }
2812
2813    fprintf(file, " ");
2814
2815    if (inst->force_uncompressed)
2816       fprintf(file, "1sthalf ");
2817
2818    if (inst->force_sechalf)
2819       fprintf(file, "2ndhalf ");
2820
2821    fprintf(file, "\n");
2822 }
2823
2824 /**
2825  * Possibly returns an instruction that set up @param reg.
2826  *
2827  * Sometimes we want to take the result of some expression/variable
2828  * dereference tree and rewrite the instruction generating the result
2829  * of the tree.  When processing the tree, we know that the
2830  * instructions generated are all writing temporaries that are dead
2831  * outside of this tree.  So, if we have some instructions that write
2832  * a temporary, we're free to point that temp write somewhere else.
2833  *
2834  * Note that this doesn't guarantee that the instruction generated
2835  * only reg -- it might be the size=4 destination of a texture instruction.
2836  */
2837 fs_inst *
2838 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2839                                            fs_inst *end,
2840                                            const fs_reg &reg)
2841 {
2842    if (end == start ||
2843        end->is_partial_write() ||
2844        reg.reladdr ||
2845        !reg.equals(end->dst)) {
2846       return NULL;
2847    } else {
2848       return end;
2849    }
2850 }
2851
2852 void
2853 fs_visitor::setup_payload_gen6()
2854 {
2855    bool uses_depth =
2856       (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2857    unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
2858
2859    assert(brw->gen >= 6);
2860
2861    /* R0-1: masks, pixel X/Y coordinates. */
2862    payload.num_regs = 2;
2863    /* R2: only for 32-pixel dispatch.*/
2864
2865    /* R3-26: barycentric interpolation coordinates.  These appear in the
2866     * same order that they appear in the brw_wm_barycentric_interp_mode
2867     * enum.  Each set of coordinates occupies 2 registers if dispatch width
2868     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2869     * appear if they were enabled using the "Barycentric Interpolation
2870     * Mode" bits in WM_STATE.
2871     */
2872    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2873       if (barycentric_interp_modes & (1 << i)) {
2874          payload.barycentric_coord_reg[i] = payload.num_regs;
2875          payload.num_regs += 2;
2876          if (dispatch_width == 16) {
2877             payload.num_regs += 2;
2878          }
2879       }
2880    }
2881
2882    /* R27: interpolated depth if uses source depth */
2883    if (uses_depth) {
2884       payload.source_depth_reg = payload.num_regs;
2885       payload.num_regs++;
2886       if (dispatch_width == 16) {
2887          /* R28: interpolated depth if not SIMD8. */
2888          payload.num_regs++;
2889       }
2890    }
2891    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2892    if (uses_depth) {
2893       payload.source_w_reg = payload.num_regs;
2894       payload.num_regs++;
2895       if (dispatch_width == 16) {
2896          /* R30: interpolated W if not SIMD8. */
2897          payload.num_regs++;
2898       }
2899    }
2900
2901    prog_data->uses_pos_offset = key->compute_pos_offset;
2902    /* R31: MSAA position offsets. */
2903    if (prog_data->uses_pos_offset) {
2904       payload.sample_pos_reg = payload.num_regs;
2905       payload.num_regs++;
2906    }
2907
2908    /* R32: MSAA input coverage mask */
2909    if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2910       assert(brw->gen >= 7);
2911       payload.sample_mask_in_reg = payload.num_regs;
2912       payload.num_regs++;
2913       if (dispatch_width == 16) {
2914          /* R33: input coverage mask if not SIMD8. */
2915          payload.num_regs++;
2916       }
2917    }
2918
2919    /* R34-: bary for 32-pixel. */
2920    /* R58-59: interp W for 32-pixel. */
2921
2922    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2923       source_depth_to_render_target = true;
2924    }
2925 }
2926
2927 void
2928 fs_visitor::assign_binding_table_offsets()
2929 {
2930    uint32_t next_binding_table_offset = 0;
2931
2932    /* If there are no color regions, we still perform an FB write to a null
2933     * renderbuffer, which we place at surface index 0.
2934     */
2935    prog_data->binding_table.render_target_start = next_binding_table_offset;
2936    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
2937
2938    assign_common_binding_table_offsets(next_binding_table_offset);
2939 }
2940
2941 void
2942 fs_visitor::calculate_register_pressure()
2943 {
2944    invalidate_live_intervals();
2945    calculate_live_intervals();
2946
2947    int num_instructions = 0;
2948    foreach_in_list(fs_inst, inst, &instructions) {
2949       ++num_instructions;
2950    }
2951
2952    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
2953
2954    for (int reg = 0; reg < virtual_grf_count; reg++) {
2955       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
2956          regs_live_at_ip[ip] += virtual_grf_sizes[reg];
2957    }
2958 }
2959
2960 /**
2961  * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
2962  *
2963  * The needs_unlit_centroid_workaround ends up producing one of these per
2964  * channel of centroid input, so it's good to clean them up.
2965  *
2966  * An assumption here is that nothing ever modifies the dispatched pixels
2967  * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
2968  * dictates that anyway.
2969  */
2970 void
2971 fs_visitor::opt_drop_redundant_mov_to_flags()
2972 {
2973    bool flag_mov_found[2] = {false};
2974
2975    foreach_in_list_safe(fs_inst, inst, &instructions) {
2976       if (inst->is_control_flow()) {
2977          memset(flag_mov_found, 0, sizeof(flag_mov_found));
2978       } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
2979          if (!flag_mov_found[inst->flag_subreg])
2980             flag_mov_found[inst->flag_subreg] = true;
2981          else
2982             inst->remove();
2983       } else if (inst->writes_flag()) {
2984          flag_mov_found[inst->flag_subreg] = false;
2985       }
2986    }
2987 }
2988
2989 bool
2990 fs_visitor::run()
2991 {
2992    sanity_param_count = fp->Base.Parameters->NumParameters;
2993    bool allocated_without_spills;
2994
2995    assign_binding_table_offsets();
2996
2997    if (brw->gen >= 6)
2998       setup_payload_gen6();
2999    else
3000       setup_payload_gen4();
3001
3002    if (0) {
3003       emit_dummy_fs();
3004    } else {
3005       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3006          emit_shader_time_begin();
3007
3008       calculate_urb_setup();
3009       if (fp->Base.InputsRead > 0) {
3010          if (brw->gen < 6)
3011             emit_interpolation_setup_gen4();
3012          else
3013             emit_interpolation_setup_gen6();
3014       }
3015
3016       /* We handle discards by keeping track of the still-live pixels in f0.1.
3017        * Initialize it with the dispatched pixels.
3018        */
3019       if (fp->UsesKill || key->alpha_test_func) {
3020          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3021          discard_init->flag_subreg = 1;
3022       }
3023
3024       /* Generate FS IR for main().  (the visitor only descends into
3025        * functions called "main").
3026        */
3027       if (shader) {
3028          foreach_in_list(ir_instruction, ir, shader->base.ir) {
3029             base_ir = ir;
3030             this->result = reg_undef;
3031             ir->accept(this);
3032          }
3033       } else {
3034          emit_fragment_program_code();
3035       }
3036       base_ir = NULL;
3037       if (failed)
3038          return false;
3039
3040       emit(FS_OPCODE_PLACEHOLDER_HALT);
3041
3042       if (key->alpha_test_func)
3043          emit_alpha_test();
3044
3045       emit_fb_writes();
3046
3047       split_virtual_grfs();
3048
3049       move_uniform_array_access_to_pull_constants();
3050       assign_constant_locations();
3051       demote_pull_constants();
3052
3053       opt_drop_redundant_mov_to_flags();
3054
3055 #define OPT(pass, args...) do {                                            \
3056       pass_num++;                                                          \
3057       bool this_progress = pass(args);                                     \
3058                                                                            \
3059       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {      \
3060          char filename[64];                                                \
3061          snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass,              \
3062                   dispatch_width, shader_prog->Name, iteration, pass_num); \
3063                                                                            \
3064          backend_visitor::dump_instructions(filename);                     \
3065       }                                                                    \
3066                                                                            \
3067       progress = progress || this_progress;                                \
3068    } while (false)
3069
3070       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3071          char filename[64];
3072          snprintf(filename, 64, "fs%d-%04d-00-start",
3073                   dispatch_width, shader_prog->Name);
3074
3075          backend_visitor::dump_instructions(filename);
3076       }
3077
3078       bool progress;
3079       int iteration = 0;
3080       do {
3081          progress = false;
3082          iteration++;
3083          int pass_num = 0;
3084
3085          compact_virtual_grfs();
3086
3087          OPT(remove_duplicate_mrf_writes);
3088
3089          OPT(opt_algebraic);
3090          OPT(opt_cse);
3091          OPT(opt_copy_propagate);
3092          OPT(opt_peephole_predicated_break);
3093          OPT(dead_code_eliminate);
3094          OPT(opt_peephole_sel);
3095          OPT(dead_control_flow_eliminate, this);
3096          OPT(opt_saturate_propagation);
3097          OPT(register_coalesce);
3098          OPT(compute_to_mrf);
3099       } while (progress);
3100
3101       if (lower_load_payload()) {
3102          register_coalesce();
3103          dead_code_eliminate();
3104       }
3105
3106       lower_uniform_pull_constant_loads();
3107
3108       assign_curb_setup();
3109       assign_urb_setup();
3110
3111       static enum instruction_scheduler_mode pre_modes[] = {
3112          SCHEDULE_PRE,
3113          SCHEDULE_PRE_NON_LIFO,
3114          SCHEDULE_PRE_LIFO,
3115       };
3116
3117       /* Try each scheduling heuristic to see if it can successfully register
3118        * allocate without spilling.  They should be ordered by decreasing
3119        * performance but increasing likelihood of allocating.
3120        */
3121       for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3122          schedule_instructions(pre_modes[i]);
3123
3124          if (0) {
3125             assign_regs_trivial();
3126             allocated_without_spills = true;
3127          } else {
3128             allocated_without_spills = assign_regs(false);
3129          }
3130          if (allocated_without_spills)
3131             break;
3132       }
3133
3134       if (!allocated_without_spills) {
3135          /* We assume that any spilling is worse than just dropping back to
3136           * SIMD8.  There's probably actually some intermediate point where
3137           * SIMD16 with a couple of spills is still better.
3138           */
3139          if (dispatch_width == 16) {
3140             fail("Failure to register allocate.  Reduce number of "
3141                  "live scalar values to avoid this.");
3142          } else {
3143             perf_debug("Fragment shader triggered register spilling.  "
3144                        "Try reducing the number of live scalar values to "
3145                        "improve performance.\n");
3146          }
3147
3148          /* Since we're out of heuristics, just go spill registers until we
3149           * get an allocation.
3150           */
3151          while (!assign_regs(true)) {
3152             if (failed)
3153                break;
3154          }
3155       }
3156    }
3157    assert(force_uncompressed_stack == 0);
3158
3159    /* This must come after all optimization and register allocation, since
3160     * it inserts dead code that happens to have side effects, and it does
3161     * so based on the actual physical registers in use.
3162     */
3163    insert_gen4_send_dependency_workarounds();
3164
3165    if (failed)
3166       return false;
3167
3168    if (!allocated_without_spills)
3169       schedule_instructions(SCHEDULE_POST);
3170
3171    if (last_scratch > 0) {
3172       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3173    }
3174
3175    if (dispatch_width == 8)
3176       prog_data->reg_blocks = brw_register_blocks(grf_used);
3177    else
3178       prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3179
3180    /* If any state parameters were appended, then ParameterValues could have
3181     * been realloced, in which case the driver uniform storage set up by
3182     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3183     * sure that didn't happen.
3184     */
3185    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3186
3187    return !failed;
3188 }
3189
3190 const unsigned *
3191 brw_wm_fs_emit(struct brw_context *brw,
3192                void *mem_ctx,
3193                const struct brw_wm_prog_key *key,
3194                struct brw_wm_prog_data *prog_data,
3195                struct gl_fragment_program *fp,
3196                struct gl_shader_program *prog,
3197                unsigned *final_assembly_size)
3198 {
3199    bool start_busy = false;
3200    double start_time = 0;
3201
3202    if (unlikely(brw->perf_debug)) {
3203       start_busy = (brw->batch.last_bo &&
3204                     drm_intel_bo_busy(brw->batch.last_bo));
3205       start_time = get_time();
3206    }
3207
3208    struct brw_shader *shader = NULL;
3209    if (prog)
3210       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3211
3212    if (unlikely(INTEL_DEBUG & DEBUG_WM))
3213       brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3214
3215    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3216     */
3217    fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3218    if (!v.run()) {
3219       if (prog) {
3220          prog->LinkStatus = false;
3221          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3222       }
3223
3224       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3225                     v.fail_msg);
3226
3227       return NULL;
3228    }
3229
3230    exec_list *simd16_instructions = NULL;
3231    fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3232    if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3233       if (!v.simd16_unsupported) {
3234          /* Try a SIMD16 compile */
3235          v2.import_uniforms(&v);
3236          if (!v2.run()) {
3237             perf_debug("SIMD16 shader failed to compile, falling back to "
3238                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3239          } else {
3240             simd16_instructions = &v2.instructions;
3241          }
3242       } else {
3243          perf_debug("SIMD16 shader unsupported, falling back to "
3244                     "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3245       }
3246    }
3247
3248    const unsigned *assembly = NULL;
3249    if (brw->gen >= 8) {
3250       gen8_fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src);
3251       assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3252                                      final_assembly_size);
3253    } else {
3254       fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src,
3255                      v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3256       assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3257                                      final_assembly_size);
3258    }
3259
3260    if (unlikely(brw->perf_debug) && shader) {
3261       if (shader->compiled_once)
3262          brw_wm_debug_recompile(brw, prog, key);
3263       shader->compiled_once = true;
3264
3265       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3266          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3267                     (get_time() - start_time) * 1000);
3268       }
3269    }
3270
3271    return assembly;
3272 }
3273
3274 bool
3275 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3276 {
3277    struct brw_context *brw = brw_context(ctx);
3278    struct brw_wm_prog_key key;
3279
3280    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3281       return true;
3282
3283    struct gl_fragment_program *fp = (struct gl_fragment_program *)
3284       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3285    struct brw_fragment_program *bfp = brw_fragment_program(fp);
3286    bool program_uses_dfdy = fp->UsesDFdy;
3287
3288    memset(&key, 0, sizeof(key));
3289
3290    if (brw->gen < 6) {
3291       if (fp->UsesKill)
3292          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3293
3294       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3295          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3296
3297       /* Just assume depth testing. */
3298       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3299       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3300    }
3301
3302    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3303                                          BRW_FS_VARYING_INPUT_MASK) > 16)
3304       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3305
3306    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3307    for (unsigned i = 0; i < sampler_count; i++) {
3308       if (fp->Base.ShadowSamplers & (1 << i)) {
3309          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3310          key.tex.swizzles[i] =
3311             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3312       } else {
3313          /* Color sampler: assume no swizzling. */
3314          key.tex.swizzles[i] = SWIZZLE_XYZW;
3315       }
3316    }
3317
3318    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3319       key.drawable_height = ctx->DrawBuffer->Height;
3320    }
3321
3322    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3323          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3324          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3325
3326    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3327       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3328                           key.nr_color_regions > 1;
3329    }
3330
3331    /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE.  The
3332     * quality of the derivatives is likely to be determined by the driconf
3333     * option.
3334     */
3335    key.high_quality_derivatives = brw->disable_derivative_optimization;
3336
3337    key.program_string_id = bfp->id;
3338
3339    uint32_t old_prog_offset = brw->wm.base.prog_offset;
3340    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3341
3342    bool success = do_wm_prog(brw, prog, bfp, &key);
3343
3344    brw->wm.base.prog_offset = old_prog_offset;
3345    brw->wm.prog_data = old_prog_data;
3346
3347    return success;
3348 }