i965: Store uniform constant values in a gl_constant_value instead of float
[mesa.git] / src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
56 {
57 memset(this, 0, sizeof(*this));
58
59 this->opcode = opcode;
60 this->dst = dst;
61 this->src = src;
62 this->sources = sources;
63
64 this->conditional_mod = BRW_CONDITIONAL_NONE;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68
69 this->writes_accumulator = false;
70 }
71
72 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
73 {
74 fs_reg *src = ralloc_array(this, fs_reg, 3);
75 init(opcode, dst, src, 0);
76 }
77
78 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
79 {
80 fs_reg *src = ralloc_array(this, fs_reg, 3);
81 src[0] = src0;
82 init(opcode, dst, src, 1);
83 }
84
85 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
86 const fs_reg &src1)
87 {
88 fs_reg *src = ralloc_array(this, fs_reg, 3);
89 src[0] = src0;
90 src[1] = src1;
91 init(opcode, dst, src, 2);
92 }
93
94 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
95 const fs_reg &src1, const fs_reg &src2)
96 {
97 fs_reg *src = ralloc_array(this, fs_reg, 3);
98 src[0] = src0;
99 src[1] = src1;
100 src[2] = src2;
101 init(opcode, dst, src, 3);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
105 {
106 init(opcode, dst, src, sources);
107 }
108
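/* Copy constructor: the memcpy below shallow-copies every field (including
 * the src pointer), so the source array is then re-allocated against the new
 * instruction and deep-copied, leaving each fs_inst owning its own src
 * storage.
 */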
109 fs_inst::fs_inst(const fs_inst &that)
110 {
111 memcpy(this, &that, sizeof(that));
112
113 this->src = ralloc_array(this, fs_reg, that.sources);
114
115 for (int i = 0; i < that.sources; i++)
116 this->src[i] = that.src[i];
117 }
118
119 void
120 fs_inst::resize_sources(uint8_t num_sources)
121 {
122 if (this->sources != num_sources) {
123 this->src = reralloc(this, this->src, fs_reg, num_sources);
124 this->sources = num_sources;
125 }
126 }
127
128 #define ALU1(op) \
129 fs_inst * \
130 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
131 { \
132 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
133 }
134
135 #define ALU2(op) \
136 fs_inst * \
137 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
138 const fs_reg &src1) \
139 { \
140 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
141 }
142
143 #define ALU2_ACC(op) \
144 fs_inst * \
145 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
146 const fs_reg &src1) \
147 { \
148 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
149 inst->writes_accumulator = true; \
150 return inst; \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
156 const fs_reg &src1, const fs_reg &src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
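/* Each ALU* invocation below stamps out a small emit helper on fs_visitor.
 * As an illustration, ALU2(ADD) expands to:
 *
 *    fs_inst *
 *    fs_visitor::ADD(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */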
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2_ACC(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(SEL)
188 ALU2(MAC)
189
190 /** Gen4 predicated IF. */
191 fs_inst *
192 fs_visitor::IF(enum brw_predicate predicate)
193 {
194 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
195 inst->predicate = predicate;
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 fs_inst *
201 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
206 reg_null_d, src0, src1);
207 inst->conditional_mod = condition;
208 return inst;
209 }
210
211 /**
212 * CMP: Sets the low bit of the destination channels with the result
213 * of the comparison, while the upper bits are undefined, and updates
214 * the flag register with the packed 16 bits of the result.
215 */
216 fs_inst *
217 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
218 enum brw_conditional_mod condition)
219 {
220 fs_inst *inst;
221
222 /* Take the instruction:
223 *
224 * CMP null<d> src0<f> src1<f>
225 *
226 * Original gen4 does type conversion to the destination type before
227 * comparison, producing garbage results for floating point comparisons.
228 * gen5 does the comparison on the execution type (resolved source types),
229 * so dst type doesn't matter. gen6 does comparison and then uses the
230 * result as if it was the dst type with no conversion, which happens to
231 * mostly work out for float-interpreted-as-int since our comparisons are
232 * for >0, =0, <0.
233 */
234 if (brw->gen == 4) {
235 dst.type = src0.type;
236 if (dst.file == HW_REG)
237 dst.fixed_hw_reg.type = dst.type;
238 }
239
240 resolve_ud_negate(&src0);
241 resolve_ud_negate(&src1);
242
243 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
244 inst->conditional_mod = condition;
245
246 return inst;
247 }
248
249 fs_inst *
250 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
251 {
252 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
253 sources);
254 inst->regs_written = sources;
255
256 return inst;
257 }
258
259 exec_list
260 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
261 const fs_reg &surf_index,
262 const fs_reg &varying_offset,
263 uint32_t const_offset)
264 {
265 exec_list instructions;
266 fs_inst *inst;
267
268 /* We have our constant surface use a pitch of 4 bytes, so our index can
269 * be any component of a vector, and then we load 4 contiguous
270 * components starting from that.
271 *
272 * We break down the const_offset into a portion added to the variable
273 * offset and a portion done using reg_offset, which means that if you
274 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
275 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
276 * CSE can later notice that those loads are all the same and eliminate
277 * the redundant ones.
278 */
279 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
280 instructions.push_tail(ADD(vec4_offset,
281 varying_offset, const_offset & ~3));
282
283 int scale = 1;
284 if (brw->gen == 4 && dispatch_width == 8) {
285 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
286 * u, v, r) as parameters, or we can just use the SIMD16 message
287 * consisting of (header, u). We choose the second, at the cost of a
288 * longer return length.
289 */
290 scale = 2;
291 }
292
293 enum opcode op;
294 if (brw->gen >= 7)
295 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
296 else
297 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
298 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
299 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
300 inst->regs_written = 4 * scale;
301 instructions.push_tail(inst);
302
303 if (brw->gen < 7) {
304 inst->base_mrf = 13;
305 inst->header_present = true;
306 if (brw->gen == 4)
307 inst->mlen = 3;
308 else
309 inst->mlen = 1 + dispatch_width / 8;
310 }
311
312 vec4_result.reg_offset += (const_offset & 3) * scale;
313 instructions.push_tail(MOV(dst, vec4_result));
314
315 return instructions;
316 }
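/* A usage sketch (see demote_pull_constants() below for the real call site):
 * for a uniform accessed with a variable index, the generated loads look
 * roughly like
 *
 *    exec_list list = VARYING_PULL_CONSTANT_LOAD(dst, surf_index,
 *                                                *inst->src[i].reladdr,
 *                                                pull_index);
 *    inst->insert_before(&list);
 *
 * where dst is a freshly allocated float VGRF and surf_index points at the
 * pull constant binding table entry.
 */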
317
318 /**
319 * A helper for MOV generation for fixing up broken hardware SEND dependency
320 * handling.
321 */
322 fs_inst *
323 fs_visitor::DEP_RESOLVE_MOV(int grf)
324 {
325 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
326
327 inst->ir = NULL;
328 inst->annotation = "send dependency resolve";
329
330 /* The caller always wants uncompressed to emit the minimal extra
331 * dependencies, and to avoid having to deal with aligning its regs to 2.
332 */
333 inst->force_uncompressed = true;
334
335 return inst;
336 }
337
338 bool
339 fs_inst::equals(fs_inst *inst) const
340 {
341 return (opcode == inst->opcode &&
342 dst.equals(inst->dst) &&
343 src[0].equals(inst->src[0]) &&
344 src[1].equals(inst->src[1]) &&
345 src[2].equals(inst->src[2]) &&
346 saturate == inst->saturate &&
347 predicate == inst->predicate &&
348 conditional_mod == inst->conditional_mod &&
349 mlen == inst->mlen &&
350 base_mrf == inst->base_mrf &&
351 target == inst->target &&
352 eot == inst->eot &&
353 header_present == inst->header_present &&
354 shadow_compare == inst->shadow_compare &&
355 offset == inst->offset);
356 }
357
358 bool
359 fs_inst::overwrites_reg(const fs_reg &reg) const
360 {
361 return (reg.file == dst.file &&
362 reg.reg == dst.reg &&
363 reg.reg_offset >= dst.reg_offset &&
364 reg.reg_offset < dst.reg_offset + regs_written);
365 }
366
367 bool
368 fs_inst::is_send_from_grf() const
369 {
370 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
371 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
372 opcode == FS_OPCODE_INTERPOLATE_AT_CENTROID ||
373 opcode == FS_OPCODE_INTERPOLATE_AT_SAMPLE ||
374 opcode == FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET ||
375 opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET ||
376 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
377 src[1].file == GRF) ||
378 (is_tex() && src[0].file == GRF));
379 }
380
381 bool
382 fs_inst::can_do_source_mods(struct brw_context *brw)
383 {
384 if (brw->gen == 6 && is_math())
385 return false;
386
387 if (is_send_from_grf())
388 return false;
389
390 if (!backend_instruction::can_do_source_mods())
391 return false;
392
393 return true;
394 }
395
396 void
397 fs_reg::init()
398 {
399 memset(this, 0, sizeof(*this));
400 stride = 1;
401 }
402
403 /** Generic unset register constructor. */
404 fs_reg::fs_reg()
405 {
406 init();
407 this->file = BAD_FILE;
408 }
409
410 /** Immediate value constructor. */
411 fs_reg::fs_reg(float f)
412 {
413 init();
414 this->file = IMM;
415 this->type = BRW_REGISTER_TYPE_F;
416 this->fixed_hw_reg.dw1.f = f;
417 }
418
419 /** Immediate value constructor. */
420 fs_reg::fs_reg(int32_t i)
421 {
422 init();
423 this->file = IMM;
424 this->type = BRW_REGISTER_TYPE_D;
425 this->fixed_hw_reg.dw1.d = i;
426 }
427
428 /** Immediate value constructor. */
429 fs_reg::fs_reg(uint32_t u)
430 {
431 init();
432 this->file = IMM;
433 this->type = BRW_REGISTER_TYPE_UD;
434 this->fixed_hw_reg.dw1.ud = u;
435 }
436
437 /** Fixed brw_reg. */
438 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
439 {
440 init();
441 this->file = HW_REG;
442 this->fixed_hw_reg = fixed_hw_reg;
443 this->type = fixed_hw_reg.type;
444 }
445
446 bool
447 fs_reg::equals(const fs_reg &r) const
448 {
449 return (file == r.file &&
450 reg == r.reg &&
451 reg_offset == r.reg_offset &&
452 subreg_offset == r.subreg_offset &&
453 type == r.type &&
454 negate == r.negate &&
455 abs == r.abs &&
456 !reladdr && !r.reladdr &&
457 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
458 sizeof(fixed_hw_reg)) == 0 &&
459 stride == r.stride);
460 }
461
462 fs_reg &
463 fs_reg::apply_stride(unsigned stride)
464 {
465 assert((this->stride * stride) <= 4 &&
466 (is_power_of_two(stride) || stride == 0) &&
467 file != HW_REG && file != IMM);
468 this->stride *= stride;
469 return *this;
470 }
471
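/* set_smear() picks a single component to replicate across the execution
 * size: it moves subreg_offset to the requested component and forces a
 * stride of 0.  get_timestamp() uses set_smear(0) to broadcast the low 32
 * bits of the timestamp, and emit_shader_time_end() uses set_smear(2) to
 * read the timestamp-reset field.
 */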
472 fs_reg &
473 fs_reg::set_smear(unsigned subreg)
474 {
475 assert(file != HW_REG && file != IMM);
476 subreg_offset = subreg * type_sz(type);
477 stride = 0;
478 return *this;
479 }
480
481 bool
482 fs_reg::is_contiguous() const
483 {
484 return stride == 1;
485 }
486
487 bool
488 fs_reg::is_valid_3src() const
489 {
490 return file == GRF || file == UNIFORM;
491 }
492
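/* type_size() measures GLSL types in scalar component slots.  Some example
 * values implied by the switch below: float -> 1, vec4 -> 4, mat3 -> 9,
 * vec4[20] -> 80, and samplers/atomic counters -> 0 since they take no
 * register space.
 */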
493 int
494 fs_visitor::type_size(const struct glsl_type *type)
495 {
496 unsigned int size, i;
497
498 switch (type->base_type) {
499 case GLSL_TYPE_UINT:
500 case GLSL_TYPE_INT:
501 case GLSL_TYPE_FLOAT:
502 case GLSL_TYPE_BOOL:
503 return type->components();
504 case GLSL_TYPE_ARRAY:
505 return type_size(type->fields.array) * type->length;
506 case GLSL_TYPE_STRUCT:
507 size = 0;
508 for (i = 0; i < type->length; i++) {
509 size += type_size(type->fields.structure[i].type);
510 }
511 return size;
512 case GLSL_TYPE_SAMPLER:
513 /* Samplers take up no register space, since they're baked in at
514 * link time.
515 */
516 return 0;
517 case GLSL_TYPE_ATOMIC_UINT:
518 return 0;
519 case GLSL_TYPE_IMAGE:
520 case GLSL_TYPE_VOID:
521 case GLSL_TYPE_ERROR:
522 case GLSL_TYPE_INTERFACE:
523 unreachable("not reached");
524 }
525
526 return 0;
527 }
528
529 fs_reg
530 fs_visitor::get_timestamp()
531 {
532 assert(brw->gen >= 7);
533
534 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
535 BRW_ARF_TIMESTAMP,
536 0),
537 BRW_REGISTER_TYPE_UD));
538
539 fs_reg dst = fs_reg(this, glsl_type::uint_type);
540
541 fs_inst *mov = emit(MOV(dst, ts));
542 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
543 * even if it's not enabled in the dispatch.
544 */
545 mov->force_writemask_all = true;
546 mov->force_uncompressed = true;
547
548 /* The caller wants the low 32 bits of the timestamp. Since it's running
550 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
550 * which is plenty of time for our purposes. It is identical across the
551 * EUs, but since it's tracking GPU core speed it will increment at a
552 * varying rate as render P-states change.
553 *
554 * The caller could also check if render P-states have changed (or anything
555 * else that might disrupt timing) by setting smear to 2 and checking if
556 * that field is != 0.
557 */
558 dst.set_smear(0);
559
560 return dst;
561 }
562
563 void
564 fs_visitor::emit_shader_time_begin()
565 {
566 current_annotation = "shader time start";
567 shader_start_time = get_timestamp();
568 }
569
570 void
571 fs_visitor::emit_shader_time_end()
572 {
573 current_annotation = "shader time end";
574
575 enum shader_time_shader_type type, written_type, reset_type;
576 if (dispatch_width == 8) {
577 type = ST_FS8;
578 written_type = ST_FS8_WRITTEN;
579 reset_type = ST_FS8_RESET;
580 } else {
581 assert(dispatch_width == 16);
582 type = ST_FS16;
583 written_type = ST_FS16_WRITTEN;
584 reset_type = ST_FS16_RESET;
585 }
586
587 fs_reg shader_end_time = get_timestamp();
588
589 /* Check that there weren't any timestamp reset events (assuming these
590 * were the only two timestamp reads that happened).
591 */
592 fs_reg reset = shader_end_time;
593 reset.set_smear(2);
594 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
595 test->conditional_mod = BRW_CONDITIONAL_Z;
596 emit(IF(BRW_PREDICATE_NORMAL));
597
598 push_force_uncompressed();
599 fs_reg start = shader_start_time;
600 start.negate = true;
601 fs_reg diff = fs_reg(this, glsl_type::uint_type);
602 emit(ADD(diff, start, shader_end_time));
603
604 /* If there were no instructions between the two timestamp gets, the diff
605 * is 2 cycles. Remove that overhead, so I can forget about that when
606 * trying to determine the time taken for single instructions.
607 */
608 emit(ADD(diff, diff, fs_reg(-2u)));
609
610 emit_shader_time_write(type, diff);
611 emit_shader_time_write(written_type, fs_reg(1u));
612 emit(BRW_OPCODE_ELSE);
613 emit_shader_time_write(reset_type, fs_reg(1u));
614 emit(BRW_OPCODE_ENDIF);
615
616 pop_force_uncompressed();
617 }
618
619 void
620 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
621 fs_reg value)
622 {
623 int shader_time_index =
624 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
625 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
626
627 fs_reg payload;
628 if (dispatch_width == 8)
629 payload = fs_reg(this, glsl_type::uvec2_type);
630 else
631 payload = fs_reg(this, glsl_type::uint_type);
632
633 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
634 fs_reg(), payload, offset, value));
635 }
636
637 void
638 fs_visitor::vfail(const char *format, va_list va)
639 {
640 char *msg;
641
642 if (failed)
643 return;
644
645 failed = true;
646
647 msg = ralloc_vasprintf(mem_ctx, format, va);
648 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
649
650 this->fail_msg = msg;
651
652 if (INTEL_DEBUG & DEBUG_WM) {
653 fprintf(stderr, "%s", msg);
654 }
655 }
656
657 void
658 fs_visitor::fail(const char *format, ...)
659 {
660 va_list va;
661
662 va_start(va, format);
663 vfail(format, va);
664 va_end(va);
665 }
666
667 /**
668 * Mark this program as impossible to compile in SIMD16 mode.
669 *
670 * During the SIMD8 compile (which happens first), we can detect and flag
671 * things that are unsupported in SIMD16 mode, so the compiler can skip
672 * the SIMD16 compile altogether.
673 *
674 * During a SIMD16 compile (if one happens anyway), this just calls fail().
675 */
676 void
677 fs_visitor::no16(const char *format, ...)
678 {
679 va_list va;
680
681 va_start(va, format);
682
683 if (dispatch_width == 16) {
684 vfail(format, va);
685 } else {
686 simd16_unsupported = true;
687
688 if (brw->perf_debug) {
689 if (no16_msg)
690 ralloc_vasprintf_append(&no16_msg, format, va);
691 else
692 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
693 }
694 }
695
696 va_end(va);
697 }
698
699 fs_inst *
700 fs_visitor::emit(enum opcode opcode)
701 {
702 return emit(new(mem_ctx) fs_inst(opcode));
703 }
704
705 fs_inst *
706 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
707 {
708 return emit(new(mem_ctx) fs_inst(opcode, dst));
709 }
710
711 fs_inst *
712 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
713 {
714 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
715 }
716
717 fs_inst *
718 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
719 const fs_reg &src1)
720 {
721 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
722 }
723
724 fs_inst *
725 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
726 const fs_reg &src1, const fs_reg &src2)
727 {
728 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
729 }
730
731 fs_inst *
732 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
733 fs_reg src[], int sources)
734 {
735 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
736 }
737
738 void
739 fs_visitor::push_force_uncompressed()
740 {
741 force_uncompressed_stack++;
742 }
743
744 void
745 fs_visitor::pop_force_uncompressed()
746 {
747 force_uncompressed_stack--;
748 assert(force_uncompressed_stack >= 0);
749 }
750
751 /**
752 * Returns true if the instruction has a flag that means it won't
753 * update an entire destination register.
754 *
755 * For example, dead code elimination and live variable analysis want to know
756 * when a write to a variable screens off any preceding values that were in
757 * it.
758 */
759 bool
760 fs_inst::is_partial_write() const
761 {
762 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
763 this->force_uncompressed ||
764 this->force_sechalf || !this->dst.is_contiguous());
765 }
766
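/* regs_read() reports how many virtual registers a source spans.  The only
 * special case is a texture message whose payload lives in a GRF: the
 * payload covers mlen registers, e.g. mlen == 5 reads 5 registers in SIMD8
 * and (5 + 1) / 2 == 3 in SIMD16, since a SIMD16 virtual GRF covers two
 * physical registers.
 */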
767 int
768 fs_inst::regs_read(fs_visitor *v, int arg) const
769 {
770 if (is_tex() && arg == 0 && src[0].file == GRF) {
771 if (v->dispatch_width == 16)
772 return (mlen + 1) / 2;
773 else
774 return mlen;
775 }
776 return 1;
777 }
778
779 bool
780 fs_inst::reads_flag() const
781 {
782 return predicate;
783 }
784
785 bool
786 fs_inst::writes_flag() const
787 {
788 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
789 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
790 }
791
792 /**
793 * Returns how many MRFs an FS opcode will write over.
794 *
795 * Note that this is not the 0 or 1 implied writes in an actual gen
796 * instruction -- the FS opcodes often generate MOVs in addition.
797 */
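/* For example, a SIMD16 POW needs 2 * 16 / 8 == 4 MRFs for its two operands,
 * a SIMD8 RCP needs 1, and FB writes always need 2.
 */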
798 int
799 fs_visitor::implied_mrf_writes(fs_inst *inst)
800 {
801 if (inst->mlen == 0)
802 return 0;
803
804 if (inst->base_mrf == -1)
805 return 0;
806
807 switch (inst->opcode) {
808 case SHADER_OPCODE_RCP:
809 case SHADER_OPCODE_RSQ:
810 case SHADER_OPCODE_SQRT:
811 case SHADER_OPCODE_EXP2:
812 case SHADER_OPCODE_LOG2:
813 case SHADER_OPCODE_SIN:
814 case SHADER_OPCODE_COS:
815 return 1 * dispatch_width / 8;
816 case SHADER_OPCODE_POW:
817 case SHADER_OPCODE_INT_QUOTIENT:
818 case SHADER_OPCODE_INT_REMAINDER:
819 return 2 * dispatch_width / 8;
820 case SHADER_OPCODE_TEX:
821 case FS_OPCODE_TXB:
822 case SHADER_OPCODE_TXD:
823 case SHADER_OPCODE_TXF:
824 case SHADER_OPCODE_TXF_CMS:
825 case SHADER_OPCODE_TXF_MCS:
826 case SHADER_OPCODE_TG4:
827 case SHADER_OPCODE_TG4_OFFSET:
828 case SHADER_OPCODE_TXL:
829 case SHADER_OPCODE_TXS:
830 case SHADER_OPCODE_LOD:
831 return 1;
832 case FS_OPCODE_FB_WRITE:
833 return 2;
834 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
835 case SHADER_OPCODE_GEN4_SCRATCH_READ:
836 return 1;
837 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
838 return inst->mlen;
839 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
840 return 2;
841 case SHADER_OPCODE_UNTYPED_ATOMIC:
842 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
843 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
844 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
845 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
846 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
847 return 0;
848 default:
849 unreachable("not reached");
850 }
851 }
852
853 int
854 fs_visitor::virtual_grf_alloc(int size)
855 {
856 if (virtual_grf_array_size <= virtual_grf_count) {
857 if (virtual_grf_array_size == 0)
858 virtual_grf_array_size = 16;
859 else
860 virtual_grf_array_size *= 2;
861 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
862 virtual_grf_array_size);
863 }
864 virtual_grf_sizes[virtual_grf_count] = size;
865 return virtual_grf_count++;
866 }
867
868 /** Fixed HW reg constructor. */
869 fs_reg::fs_reg(enum register_file file, int reg)
870 {
871 init();
872 this->file = file;
873 this->reg = reg;
874 this->type = BRW_REGISTER_TYPE_F;
875 }
876
877 /** Fixed HW reg constructor. */
878 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
879 {
880 init();
881 this->file = file;
882 this->reg = reg;
883 this->type = type;
884 }
885
886 /** Automatic reg constructor. */
887 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
888 {
889 init();
890
891 this->file = GRF;
892 this->reg = v->virtual_grf_alloc(v->type_size(type));
893 this->reg_offset = 0;
894 this->type = brw_type_for_base_type(type);
895 }
896
897 fs_reg *
898 fs_visitor::variable_storage(ir_variable *var)
899 {
900 return (fs_reg *)hash_table_find(this->variable_ht, var);
901 }
902
903 void
904 import_uniforms_callback(const void *key,
905 void *data,
906 void *closure)
907 {
908 struct hash_table *dst_ht = (struct hash_table *)closure;
909 const fs_reg *reg = (const fs_reg *)data;
910
911 if (reg->file != UNIFORM)
912 return;
913
914 hash_table_insert(dst_ht, data, key);
915 }
916
917 /* For SIMD16, we need to follow the uniform setup from the SIMD8 dispatch.
918 * This brings in those uniform definitions.
919 */
920 void
921 fs_visitor::import_uniforms(fs_visitor *v)
922 {
923 hash_table_call_foreach(v->variable_ht,
924 import_uniforms_callback,
925 variable_ht);
926 this->push_constant_loc = v->push_constant_loc;
927 this->pull_constant_loc = v->pull_constant_loc;
928 this->uniforms = v->uniforms;
929 this->param_size = v->param_size;
930 }
931
932 /* Our support for uniforms is piggy-backed on the struct
933 * gl_fragment_program, because that's where the values actually
934 * get stored, rather than in some global gl_shader_program uniform
935 * store.
936 */
937 void
938 fs_visitor::setup_uniform_values(ir_variable *ir)
939 {
940 int namelen = strlen(ir->name);
941
942 /* The data for our (non-builtin) uniforms is stored in a series of
943 * gl_uniform_driver_storage structs for each subcomponent that
944 * glGetUniformLocation() could name. We know it's been set up in the same
945 * order we'd walk the type, so walk the list of storage and find anything
946 * with our name, or the prefix of a component that starts with our name.
947 */
948 unsigned params_before = uniforms;
949 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
950 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
951
952 if (strncmp(ir->name, storage->name, namelen) != 0 ||
953 (storage->name[namelen] != 0 &&
954 storage->name[namelen] != '.' &&
955 storage->name[namelen] != '[')) {
956 continue;
957 }
958
959 unsigned slots = storage->type->component_slots();
960 if (storage->array_elements)
961 slots *= storage->array_elements;
962
963 for (unsigned i = 0; i < slots; i++) {
964 stage_prog_data->param[uniforms++] = &storage->storage[i];
965 }
966 }
967
968 /* Make sure we actually initialized the right amount of stuff here. */
969 assert(params_before + ir->type->component_slots() == uniforms);
970 (void)params_before;
971 }
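/* As a sketch with a hypothetical "uniform vec4 color[2]": the loop above
 * matches the storage entry named "color", computes 4 component slots times
 * 2 array elements, and appends 8 gl_constant_value pointers to
 * stage_prog_data->param[], which the closing assertion cross-checks against
 * ir->type->component_slots().
 */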
972
973
974 /* Our support for builtin uniforms is even scarier than non-builtin.
975 * It sits on top of the PROG_STATE_VAR parameters that are
976 * automatically updated from GL context state.
977 */
978 void
979 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
980 {
981 const ir_state_slot *const slots = ir->state_slots;
982 assert(ir->state_slots != NULL);
983
984 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
985 /* This state reference has already been set up by ir_to_mesa, but we'll
986 * get the same index back here.
987 */
988 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
989 (gl_state_index *)slots[i].tokens);
990
991 /* Add each of the unique swizzles of the element as a parameter.
992 * This'll end up matching the expected layout of the
993 * array/matrix/structure we're trying to fill in.
994 */
995 int last_swiz = -1;
996 for (unsigned int j = 0; j < 4; j++) {
997 int swiz = GET_SWZ(slots[i].swizzle, j);
998 if (swiz == last_swiz)
999 break;
1000 last_swiz = swiz;
1001
1002 stage_prog_data->param[uniforms++] =
1003 &fp->Base.Parameters->ParameterValues[index][swiz];
1004 }
1005 }
1006 }
1007
1008 fs_reg *
1009 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1010 {
1011 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1012 fs_reg wpos = *reg;
1013 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1014
1015 /* gl_FragCoord.x */
1016 if (ir->data.pixel_center_integer) {
1017 emit(MOV(wpos, this->pixel_x));
1018 } else {
1019 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1020 }
1021 wpos.reg_offset++;
1022
1023 /* gl_FragCoord.y */
1024 if (!flip && ir->data.pixel_center_integer) {
1025 emit(MOV(wpos, this->pixel_y));
1026 } else {
1027 fs_reg pixel_y = this->pixel_y;
1028 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1029
1030 if (flip) {
1031 pixel_y.negate = true;
1032 offset += key->drawable_height - 1.0;
1033 }
1034
1035 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1036 }
1037 wpos.reg_offset++;
1038
1039 /* gl_FragCoord.z */
1040 if (brw->gen >= 6) {
1041 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1042 } else {
1043 emit(FS_OPCODE_LINTERP, wpos,
1044 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1045 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1046 interp_reg(VARYING_SLOT_POS, 2));
1047 }
1048 wpos.reg_offset++;
1049
1050 /* gl_FragCoord.w: Already set up in emit_interpolation */
1051 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1052
1053 return reg;
1054 }
1055
1056 fs_inst *
1057 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1058 glsl_interp_qualifier interpolation_mode,
1059 bool is_centroid, bool is_sample)
1060 {
1061 brw_wm_barycentric_interp_mode barycoord_mode;
1062 if (brw->gen >= 6) {
1063 if (is_centroid) {
1064 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1065 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1066 else
1067 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1068 } else if (is_sample) {
1069 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1070 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1071 else
1072 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1073 } else {
1074 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1075 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1076 else
1077 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1078 }
1079 } else {
1080 /* On Ironlake and below, there is only one interpolation mode.
1081 * Centroid interpolation doesn't mean anything on this hardware --
1082 * there is no multisampling.
1083 */
1084 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1085 }
1086 return emit(FS_OPCODE_LINTERP, attr,
1087 this->delta_x[barycoord_mode],
1088 this->delta_y[barycoord_mode], interp);
1089 }
1090
1091 fs_reg *
1092 fs_visitor::emit_general_interpolation(ir_variable *ir)
1093 {
1094 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1095 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1096 fs_reg attr = *reg;
1097
1098 unsigned int array_elements;
1099 const glsl_type *type;
1100
1101 if (ir->type->is_array()) {
1102 array_elements = ir->type->length;
1103 if (array_elements == 0) {
1104 fail("dereferenced array '%s' has length 0\n", ir->name);
1105 }
1106 type = ir->type->fields.array;
1107 } else {
1108 array_elements = 1;
1109 type = ir->type;
1110 }
1111
1112 glsl_interp_qualifier interpolation_mode =
1113 ir->determine_interpolation_mode(key->flat_shade);
1114
1115 int location = ir->data.location;
1116 for (unsigned int i = 0; i < array_elements; i++) {
1117 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1118 if (prog_data->urb_setup[location] == -1) {
1119 /* If there's no incoming setup data for this slot, don't
1120 * emit interpolation for it.
1121 */
1122 attr.reg_offset += type->vector_elements;
1123 location++;
1124 continue;
1125 }
1126
1127 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1128 /* Constant interpolation (flat shading) case. The SF has
1129 * handed us defined values in only the constant offset
1130 * field of the setup reg.
1131 */
1132 for (unsigned int k = 0; k < type->vector_elements; k++) {
1133 struct brw_reg interp = interp_reg(location, k);
1134 interp = suboffset(interp, 3);
1135 interp.type = reg->type;
1136 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1137 attr.reg_offset++;
1138 }
1139 } else {
1140 /* Smooth/noperspective interpolation case. */
1141 for (unsigned int k = 0; k < type->vector_elements; k++) {
1142 struct brw_reg interp = interp_reg(location, k);
1143 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1144 /* Get the pixel/sample mask into f0 so that we know
1145 * which pixels are lit. Then, for each channel that is
1146 * unlit, replace the centroid data with non-centroid
1147 * data.
1148 */
1149 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1150
1151 fs_inst *inst;
1152 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1153 false, false);
1154 inst->predicate = BRW_PREDICATE_NORMAL;
1155 inst->predicate_inverse = true;
1156 if (brw->has_pln)
1157 inst->no_dd_clear = true;
1158
1159 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1160 ir->data.centroid && !key->persample_shading,
1161 ir->data.sample || key->persample_shading);
1162 inst->predicate = BRW_PREDICATE_NORMAL;
1163 inst->predicate_inverse = false;
1164 if (brw->has_pln)
1165 inst->no_dd_check = true;
1166
1167 } else {
1168 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1169 ir->data.centroid && !key->persample_shading,
1170 ir->data.sample || key->persample_shading);
1171 }
1172 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1173 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1174 }
1175 attr.reg_offset++;
1176 }
1177
1178 }
1179 location++;
1180 }
1181 }
1182
1183 return reg;
1184 }
1185
1186 fs_reg *
1187 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1188 {
1189 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1190
1191 /* The frontfacing comes in as a bit in the thread payload. */
1192 if (brw->gen >= 6) {
1193 emit(BRW_OPCODE_ASR, *reg,
1194 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1195 fs_reg(15));
1196 emit(BRW_OPCODE_NOT, *reg, *reg);
1197 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1198 } else {
1199 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1200 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1201 * us front face
1202 */
1203 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1204 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1205 }
1206
1207 return reg;
1208 }
1209
1210 void
1211 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1212 {
1213 assert(dst.type == BRW_REGISTER_TYPE_F);
1214
1215 if (key->compute_pos_offset) {
1216 /* Convert int_sample_pos to floating point */
1217 emit(MOV(dst, int_sample_pos));
1218 /* Scale to the range [0, 1] */
1219 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1220 }
1221 else {
1222 /* From ARB_sample_shading specification:
1223 * "When rendering to a non-multisample buffer, or if multisample
1224 * rasterization is disabled, gl_SamplePosition will always be
1225 * (0.5, 0.5)."
1226 */
1227 emit(MOV(dst, fs_reg(0.5f)));
1228 }
1229 }
1230
1231 fs_reg *
1232 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1233 {
1234 assert(brw->gen >= 6);
1235 assert(ir->type == glsl_type::vec2_type);
1236
1237 this->current_annotation = "compute sample position";
1238 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1239 fs_reg pos = *reg;
1240 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1241 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1242
1243 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1244 * mode will be enabled.
1245 *
1246 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1247 * R31.1:0 Position Offset X/Y for Slot[3:0]
1248 * R31.3:2 Position Offset X/Y for Slot[7:4]
1249 * .....
1250 *
1251 * The X, Y sample positions come in as bytes in thread payload. So, read
1252 * the positions using vstride=16, width=8, hstride=2.
1253 */
1254 struct brw_reg sample_pos_reg =
1255 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1256 BRW_REGISTER_TYPE_B), 16, 8, 2);
1257
1258 fs_inst *inst = emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1259 if (dispatch_width == 16) {
1260 inst->force_uncompressed = true;
1261 inst = emit(MOV(half(int_sample_x, 1),
1262 fs_reg(suboffset(sample_pos_reg, 16))));
1263 inst->force_sechalf = true;
1264 }
1265 /* Compute gl_SamplePosition.x */
1266 compute_sample_position(pos, int_sample_x);
1267 pos.reg_offset++;
1268 inst = emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1269 if (dispatch_width == 16) {
1270 inst->force_uncompressed = true;
1271 inst = emit(MOV(half(int_sample_y, 1),
1272 fs_reg(suboffset(sample_pos_reg, 17))));
1273 inst->force_sechalf = true;
1274 }
1275 /* Compute gl_SamplePosition.y */
1276 compute_sample_position(pos, int_sample_y);
1277 return reg;
1278 }
1279
1280 fs_reg *
1281 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1282 {
1283 assert(brw->gen >= 6);
1284
1285 this->current_annotation = "compute sample id";
1286 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1287
1288 if (key->compute_sample_id) {
1289 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1290 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1291 t2.type = BRW_REGISTER_TYPE_UW;
1292
1293 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1294 * 8x multisampling, subspan 0 will represent sample N (where N
1295 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1296 * 7. We can find the value of N by looking at R0.0 bits 7:6
1297 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1298 * (since samples are always delivered in pairs). That is, we
1299 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1300 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1301 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1302 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1303 * populating a temporary variable with the sequence (0, 1, 2, 3),
1304 * and then reading from it using vstride=1, width=4, hstride=0.
1305 * These computations hold good for 4x multisampling as well.
1306 *
1307 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1308 * the first four slots are sample 0 of subspan 0; the next four
1309 * are sample 1 of subspan 0; the third group is sample 0 of
1310 * subspan 1, and finally sample 1 of subspan 1.
1311 */
1312 fs_inst *inst;
1313 inst = emit(BRW_OPCODE_AND, t1,
1314 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1315 fs_reg(0xc0));
1316 inst->force_writemask_all = true;
1317 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1318 inst->force_writemask_all = true;
1319 /* This works for both SIMD8 and SIMD16 */
1320 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1321 inst->force_writemask_all = true;
1322 /* This special instruction takes care of setting vstride=1,
1323 * width=4, hstride=0 of t2 during an ADD instruction.
1324 */
1325 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1326 } else {
1327 /* As per GL_ARB_sample_shading specification:
1328 * "When rendering to a non-multisample buffer, or if multisample
1329 * rasterization is disabled, gl_SampleID will always be zero."
1330 */
1331 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1332 }
1333
1334 return reg;
1335 }
1336
1337 fs_reg
1338 fs_visitor::fix_math_operand(fs_reg src)
1339 {
1340 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1341 * might be able to do better by doing execsize = 1 math and then
1342 * expanding that result out, but we would need to be careful with
1343 * masking.
1344 *
1345 * The hardware ignores source modifiers (negate and abs) on math
1346 * instructions, so we also move to a temp to set those up.
1347 */
1348 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1349 !src.abs && !src.negate)
1350 return src;
1351
1352 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1353 * operands to math
1354 */
1355 if (brw->gen >= 7 && src.file != IMM)
1356 return src;
1357
1358 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1359 expanded.type = src.type;
1360 emit(BRW_OPCODE_MOV, expanded, src);
1361 return expanded;
1362 }
1363
1364 fs_inst *
1365 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1366 {
1367 switch (opcode) {
1368 case SHADER_OPCODE_RCP:
1369 case SHADER_OPCODE_RSQ:
1370 case SHADER_OPCODE_SQRT:
1371 case SHADER_OPCODE_EXP2:
1372 case SHADER_OPCODE_LOG2:
1373 case SHADER_OPCODE_SIN:
1374 case SHADER_OPCODE_COS:
1375 break;
1376 default:
1377 unreachable("not reached: bad math opcode");
1378 }
1379
1380 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1381 * might be able to do better by doing execsize = 1 math and then
1382 * expanding that result out, but we would need to be careful with
1383 * masking.
1384 *
1385 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1386 * instructions, so we also move to a temp to set those up.
1387 */
1388 if (brw->gen == 6 || brw->gen == 7)
1389 src = fix_math_operand(src);
1390
1391 fs_inst *inst = emit(opcode, dst, src);
1392
1393 if (brw->gen < 6) {
1394 inst->base_mrf = 2;
1395 inst->mlen = dispatch_width / 8;
1396 }
1397
1398 return inst;
1399 }
1400
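/* Two-operand math.  On Gen4/5 the operands travel in a two-register MRF
 * payload; note the operand swap below for the integer division opcodes,
 * where the PRM defines operand 0 as the denominator.  So
 * emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, a, b) ends up with the
 * numerator a in the second payload register (base_mrf + 1) and the
 * denominator b as the instruction's direct source.
 */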
1401 fs_inst *
1402 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1403 {
1404 int base_mrf = 2;
1405 fs_inst *inst;
1406
1407 if (brw->gen >= 8) {
1408 inst = emit(opcode, dst, src0, src1);
1409 } else if (brw->gen >= 6) {
1410 src0 = fix_math_operand(src0);
1411 src1 = fix_math_operand(src1);
1412
1413 inst = emit(opcode, dst, src0, src1);
1414 } else {
1415 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1416 * "Message Payload":
1417 *
1418 * "Operand0[7]. For the INT DIV functions, this operand is the
1419 * denominator."
1420 * ...
1421 * "Operand1[7]. For the INT DIV functions, this operand is the
1422 * numerator."
1423 */
1424 bool is_int_div = opcode != SHADER_OPCODE_POW;
1425 fs_reg &op0 = is_int_div ? src1 : src0;
1426 fs_reg &op1 = is_int_div ? src0 : src1;
1427
1428 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1429 inst = emit(opcode, dst, op0, reg_null_f);
1430
1431 inst->base_mrf = base_mrf;
1432 inst->mlen = 2 * dispatch_width / 8;
1433 }
1434 return inst;
1435 }
1436
1437 void
1438 fs_visitor::assign_curb_setup()
1439 {
1440 if (dispatch_width == 8) {
1441 prog_data->base.dispatch_grf_start_reg = payload.num_regs;
1442 } else {
1443 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1444 }
1445
1446 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1447
1448 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1449 foreach_in_list(fs_inst, inst, &instructions) {
1450 for (unsigned int i = 0; i < inst->sources; i++) {
1451 if (inst->src[i].file == UNIFORM) {
1452 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1453 int constant_nr;
1454 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1455 constant_nr = push_constant_loc[uniform_nr];
1456 } else {
1457 /* Section 5.11 of the OpenGL 4.1 spec says:
1458 * "Out-of-bounds reads return undefined values, which include
1459 * values from other variables of the active program or zero."
1460 * Just return the first push constant.
1461 */
1462 constant_nr = 0;
1463 }
1464
1465 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1466 constant_nr / 8,
1467 constant_nr % 8);
1468
1469 inst->src[i].file = HW_REG;
1470 inst->src[i].fixed_hw_reg = byte_offset(
1471 retype(brw_reg, inst->src[i].type),
1472 inst->src[i].subreg_offset);
1473 }
1474 }
1475 }
1476 }
1477
1478 void
1479 fs_visitor::calculate_urb_setup()
1480 {
1481 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1482 prog_data->urb_setup[i] = -1;
1483 }
1484
1485 int urb_next = 0;
1486 /* Figure out where each of the incoming setup attributes lands. */
1487 if (brw->gen >= 6) {
1488 if (_mesa_bitcount_64(fp->Base.InputsRead &
1489 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1490 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1491 * first 16 varying inputs, so we can put them wherever we want.
1492 * Just put them in order.
1493 *
1494 * This is useful because it means that (a) inputs not used by the
1495 * fragment shader won't take up valuable register space, and (b) we
1496 * won't have to recompile the fragment shader if it gets paired with
1497 * a different vertex (or geometry) shader.
1498 */
1499 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1500 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1501 BITFIELD64_BIT(i)) {
1502 prog_data->urb_setup[i] = urb_next++;
1503 }
1504 }
1505 } else {
1506 /* We have enough input varyings that the SF/SBE pipeline stage can't
1507 * arbitrarily rearrange them to suit our whim; we have to put them
1508 * in an order that matches the output of the previous pipeline stage
1509 * (geometry or vertex shader).
1510 */
1511 struct brw_vue_map prev_stage_vue_map;
1512 brw_compute_vue_map(brw, &prev_stage_vue_map,
1513 key->input_slots_valid);
1514 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1515 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1516 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1517 slot++) {
1518 int varying = prev_stage_vue_map.slot_to_varying[slot];
1519 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1520 * unused.
1521 */
1522 if (varying != BRW_VARYING_SLOT_COUNT &&
1523 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1524 BITFIELD64_BIT(varying))) {
1525 prog_data->urb_setup[varying] = slot - first_slot;
1526 }
1527 }
1528 urb_next = prev_stage_vue_map.num_slots - first_slot;
1529 }
1530 } else {
1531 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1532 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1533 /* Point size is packed into the header, not as a general attribute */
1534 if (i == VARYING_SLOT_PSIZ)
1535 continue;
1536
1537 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1538 /* The back color slot is skipped when the front color is
1539 * also written to. In addition, some slots can be
1540 * written in the vertex shader and not read in the
1541 * fragment shader. So the register number must always be
1542 * incremented, mapped or not.
1543 */
1544 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1545 prog_data->urb_setup[i] = urb_next;
1546 urb_next++;
1547 }
1548 }
1549
1550 /*
1551 * It's an FS-only attribute, and we did interpolation for this attribute
1552 * in SF thread. So, count it here, too.
1553 *
1554 * See compile_sf_prog() for more info.
1555 */
1556 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1557 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1558 }
1559
1560 prog_data->num_varying_inputs = urb_next;
1561 }
1562
1563 void
1564 fs_visitor::assign_urb_setup()
1565 {
1566 int urb_start = payload.num_regs + prog_data->curb_read_length;
1567
1568 /* Offset all the urb_setup[] index by the actual position of the
1569 * setup regs, now that the location of the constants has been chosen.
1570 */
1571 foreach_in_list(fs_inst, inst, &instructions) {
1572 if (inst->opcode == FS_OPCODE_LINTERP) {
1573 assert(inst->src[2].file == HW_REG);
1574 inst->src[2].fixed_hw_reg.nr += urb_start;
1575 }
1576
1577 if (inst->opcode == FS_OPCODE_CINTERP) {
1578 assert(inst->src[0].file == HW_REG);
1579 inst->src[0].fixed_hw_reg.nr += urb_start;
1580 }
1581 }
1582
1583 /* Each attribute is 4 setup channels, each of which is half a reg. */
1584 this->first_non_payload_grf =
1585 urb_start + prog_data->num_varying_inputs * 2;
1586 }
1587
1588 /**
1589 * Split large virtual GRFs into separate components if we can.
1590 *
1591 * This is mostly duplicated with what brw_fs_vector_splitting does,
1592 * but that's really conservative because it's afraid of doing
1593 * splitting that doesn't result in real progress after the rest of
1594 * the optimization phases, which would cause infinite looping in
1595 * optimization. We can do it once here, safely. This also has the
1596 * opportunity to split interpolated values, or maybe even uniforms,
1597 * which we don't have at the IR level.
1598 *
1599 * We want to split, because virtual GRFs are what we register
1600 * allocate and spill (due to contiguousness requirements for some
1601 * instructions), and they're what we naturally generate in the
1602 * codegen process, but most virtual GRFs don't actually need to be
1603 * contiguous sets of GRFs. If we split, we'll end up with reduced
1604 * live intervals and better dead code elimination and coalescing.
1605 */
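/* For example, a size-3 virtual GRF vN is split into vN (now size 1) plus
 * two freshly allocated size-1 GRFs, and every access at reg_offset 1 or 2
 * is rewritten to point at the corresponding new GRF with reg_offset 0.
 */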
1606 void
1607 fs_visitor::split_virtual_grfs()
1608 {
1609 int num_vars = this->virtual_grf_count;
1610 bool split_grf[num_vars];
1611 int new_virtual_grf[num_vars];
1612
1613 /* Try to split anything larger than a single register. */
1614 for (int i = 0; i < num_vars; i++) {
1615 if (this->virtual_grf_sizes[i] != 1)
1616 split_grf[i] = true;
1617 else
1618 split_grf[i] = false;
1619 }
1620
1621 if (brw->has_pln &&
1622 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1623 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1624 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1625 * Gen6, that was the only supported interpolation mode, and since Gen6,
1626 * delta_x and delta_y are in fixed hardware registers.
1627 */
1628 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1629 false;
1630 }
1631
1632 foreach_in_list(fs_inst, inst, &instructions) {
1633 /* If there's a SEND message that requires contiguous destination
1634 * registers, no splitting is allowed.
1635 */
1636 if (inst->regs_written > 1) {
1637 split_grf[inst->dst.reg] = false;
1638 }
1639
1640 /* If we're sending from a GRF, don't split it, on the assumption that
1641 * the send is reading the whole thing.
1642 */
1643 if (inst->is_send_from_grf()) {
1644 for (int i = 0; i < inst->sources; i++) {
1645 if (inst->src[i].file == GRF) {
1646 split_grf[inst->src[i].reg] = false;
1647 }
1648 }
1649 }
1650 }
1651
1652 /* Allocate new space for split regs. Note that the virtual
1653 * numbers will be contiguous.
1654 */
1655 for (int i = 0; i < num_vars; i++) {
1656 if (split_grf[i]) {
1657 new_virtual_grf[i] = virtual_grf_alloc(1);
1658 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1659 int reg = virtual_grf_alloc(1);
1660 assert(reg == new_virtual_grf[i] + j - 1);
1661 (void) reg;
1662 }
1663 this->virtual_grf_sizes[i] = 1;
1664 }
1665 }
1666
1667 foreach_in_list(fs_inst, inst, &instructions) {
1668 if (inst->dst.file == GRF &&
1669 split_grf[inst->dst.reg] &&
1670 inst->dst.reg_offset != 0) {
1671 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1672 inst->dst.reg_offset - 1);
1673 inst->dst.reg_offset = 0;
1674 }
1675 for (int i = 0; i < inst->sources; i++) {
1676 if (inst->src[i].file == GRF &&
1677 split_grf[inst->src[i].reg] &&
1678 inst->src[i].reg_offset != 0) {
1679 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1680 inst->src[i].reg_offset - 1);
1681 inst->src[i].reg_offset = 0;
1682 }
1683 }
1684 }
1685 invalidate_live_intervals();
1686 }
1687
1688 /**
1689 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1690 *
1691 * During code generation, we create tons of temporary variables, many of
1692 * which get immediately killed and are never used again. Yet, in later
1693 * optimization and analysis passes, such as compute_live_intervals, we need
1694 * to loop over all the virtual GRFs. Compacting them can save a lot of
1695 * overhead.
1696 */
1697 void
1698 fs_visitor::compact_virtual_grfs()
1699 {
1700 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1701 return;
1702
1703 /* Mark which virtual GRFs are used, and count how many. */
1704 int remap_table[this->virtual_grf_count];
1705 memset(remap_table, -1, sizeof(remap_table));
1706
1707 foreach_in_list(const fs_inst, inst, &instructions) {
1708 if (inst->dst.file == GRF)
1709 remap_table[inst->dst.reg] = 0;
1710
1711 for (int i = 0; i < inst->sources; i++) {
1712 if (inst->src[i].file == GRF)
1713 remap_table[inst->src[i].reg] = 0;
1714 }
1715 }
1716
1717 /* Compact the GRF arrays. */
1718 int new_index = 0;
1719 for (int i = 0; i < this->virtual_grf_count; i++) {
1720 if (remap_table[i] != -1) {
1721 remap_table[i] = new_index;
1722 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1723 invalidate_live_intervals();
1724 ++new_index;
1725 }
1726 }
1727
1728 this->virtual_grf_count = new_index;
1729
1730 /* Patch all the instructions to use the newly renumbered registers */
1731 foreach_in_list(fs_inst, inst, &instructions) {
1732 if (inst->dst.file == GRF)
1733 inst->dst.reg = remap_table[inst->dst.reg];
1734
1735 for (int i = 0; i < inst->sources; i++) {
1736 if (inst->src[i].file == GRF)
1737 inst->src[i].reg = remap_table[inst->src[i].reg];
1738 }
1739 }
1740
1741 /* Patch all the references to delta_x/delta_y, since they're used in
1742 * register allocation.
1743 */
1744 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
1745 if (delta_x[i].file == GRF && remap_table[delta_x[i].reg] != -1) {
1746 delta_x[i].reg = remap_table[delta_x[i].reg];
1747 }
1748 }
1749 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
1750 if (delta_y[i].file == GRF && remap_table[delta_y[i].reg] != -1) {
1751 delta_y[i].reg = remap_table[delta_y[i].reg];
1752 }
1753 }
1754 }
1755
1756 /*
1757 * Implements array access of uniforms by inserting a
1758 * PULL_CONSTANT_LOAD instruction.
1759 *
1760 * Unlike temporary GRF array access (where we don't support it due to
1761 * the difficulty of doing relative addressing on instruction
1762 * destinations), we could potentially do array access of uniforms
1763 * that were loaded in GRF space as push constants. In real-world
1764 * usage we've seen, though, the arrays being used are always larger
1765 * than we could load as push constants, so just always move all
1766 * uniform array access out to a pull constant buffer.
1767 */
1768 void
1769 fs_visitor::move_uniform_array_access_to_pull_constants()
1770 {
1771 if (dispatch_width != 8)
1772 return;
1773
1774 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1775
1776 for (unsigned int i = 0; i < uniforms; i++) {
1777 pull_constant_loc[i] = -1;
1778 }
1779
1780 /* Walk through and find array access of uniforms. Put a copy of that
1781 * uniform in the pull constant buffer.
1782 *
1783 * Note that we don't move constant-indexed accesses to arrays. No
1784 * testing has been done of the performance impact of this choice.
1785 */
1786 foreach_in_list_safe(fs_inst, inst, &instructions) {
1787 for (int i = 0 ; i < inst->sources; i++) {
1788 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1789 continue;
1790
1791 int uniform = inst->src[i].reg;
1792
1793 /* If this array isn't already present in the pull constant buffer,
1794 * add it.
1795 */
1796 if (pull_constant_loc[uniform] == -1) {
1797 const gl_constant_value **values = &stage_prog_data->param[uniform];
1798
1799 assert(param_size[uniform]);
1800
1801 for (int j = 0; j < param_size[uniform]; j++) {
1802 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1803
1804 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1805 values[j];
1806 }
1807 }
1808 }
1809 }
1810 }
1811
1812 /**
1813 * Assign UNIFORM file registers to either push constants or pull constants.
1814 *
1815 * We allow a fragment shader to have more than the specified minimum
1816 * maximum number of fragment shader uniform components (64). If
1817 * there are too many of these, they'd fill up all of register space.
1818 * So, this will push some of them out to the pull constant buffer and
1819 * update the program to load them.
1820 */
1821 void
1822 fs_visitor::assign_constant_locations()
1823 {
1824 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1825 if (dispatch_width != 8)
1826 return;
1827
1828 /* Find which UNIFORM registers are still in use. */
1829 bool is_live[uniforms];
1830 for (unsigned int i = 0; i < uniforms; i++) {
1831 is_live[i] = false;
1832 }
1833
1834 foreach_in_list(fs_inst, inst, &instructions) {
1835 for (int i = 0; i < inst->sources; i++) {
1836 if (inst->src[i].file != UNIFORM)
1837 continue;
1838
1839 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1840 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1841 is_live[constant_nr] = true;
1842 }
1843 }
1844
1845 /* Only allow 16 registers (128 uniform components) as push constants.
1846 *
1847 * Just demote the end of the list. We could probably do better
1848 * here, demoting things that are rarely used in the program first.
1849 *
1850 * If changing this value, note the limitation about total_regs in
1851 * brw_curbe.c.
1852 */
1853 unsigned int max_push_components = 16 * 8;
1854 unsigned int num_push_constants = 0;
1855
1856 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1857
1858 for (unsigned int i = 0; i < uniforms; i++) {
1859 if (!is_live[i] || pull_constant_loc[i] != -1) {
1860 /* This UNIFORM register is either dead, or has already been demoted
1861 * to a pull const. Mark it as no longer living in the param[] array.
1862 */
1863 push_constant_loc[i] = -1;
1864 continue;
1865 }
1866
1867 if (num_push_constants < max_push_components) {
1868 /* Retain as a push constant. Record the location in the param[]
1869 * array.
1870 */
1871 push_constant_loc[i] = num_push_constants++;
1872 } else {
1873 /* Demote to a pull constant. */
1874 push_constant_loc[i] = -1;
1875
1876 int pull_index = stage_prog_data->nr_pull_params++;
1877 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1878 pull_constant_loc[i] = pull_index;
1879 }
1880 }
1881
1882 stage_prog_data->nr_params = num_push_constants;
1883
1884 /* Up until now, the param[] array has been indexed by reg + reg_offset
1885 * of UNIFORM registers. Condense it to only contain the uniforms we
1886 * chose to upload as push constants.
1887 */
1888 for (unsigned int i = 0; i < uniforms; i++) {
1889 int remapped = push_constant_loc[i];
1890
1891 if (remapped == -1)
1892 continue;
1893
1894 assert(remapped <= (int)i);
1895 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1896 }
1897 }
1898
1899 /**
1900 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1901 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1902 */
1903 void
1904 fs_visitor::demote_pull_constants()
1905 {
1906 foreach_in_list(fs_inst, inst, &instructions) {
1907 for (int i = 0; i < inst->sources; i++) {
1908 if (inst->src[i].file != UNIFORM)
1909 continue;
1910
1911 int pull_index = pull_constant_loc[inst->src[i].reg +
1912 inst->src[i].reg_offset];
1913 if (pull_index == -1)
1914 continue;
1915
1916 /* Set up the annotation tracking for newly generated instructions. */
1917 base_ir = inst->ir;
1918 current_annotation = inst->annotation;
1919
1920 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1921 fs_reg dst = fs_reg(this, glsl_type::float_type);
1922
1923 /* Generate a pull load into dst. */
1924 if (inst->src[i].reladdr) {
1925 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1926 surf_index,
1927 *inst->src[i].reladdr,
1928 pull_index);
1929 inst->insert_before(&list);
1930 inst->src[i].reladdr = NULL;
1931 } else {
1932 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1933 fs_inst *pull =
1934 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1935 dst, surf_index, offset);
1936 inst->insert_before(pull);
1937 inst->src[i].set_smear(pull_index & 3);
1938 }
1939
1940 /* Rewrite the instruction to use the temporary VGRF. */
1941 inst->src[i].file = GRF;
1942 inst->src[i].reg = dst.reg;
1943 inst->src[i].reg_offset = 0;
1944 }
1945 }
1946 invalidate_live_intervals();
1947 }
1948
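/**
 * Apply simple algebraic simplifications to the IR.
 *
 * Handles cases such as a * 1.0 -> a, a * 0.0 -> 0.0, a + 0.0 -> a, OR of a
 * register with itself -> MOV, LRP whose two endpoints are equal -> MOV, and
 * SEL instructions whose result is statically known (equal sources, or a
 * saturating compare against an immediate that can never take effect).
 * Returns true if any instruction was rewritten.
 */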
1949 bool
1950 fs_visitor::opt_algebraic()
1951 {
1952 bool progress = false;
1953
1954 foreach_in_list(fs_inst, inst, &instructions) {
1955 switch (inst->opcode) {
1956 case BRW_OPCODE_MUL:
1957 if (inst->src[1].file != IMM)
1958 continue;
1959
1960 /* a * 1.0 = a */
1961 if (inst->src[1].is_one()) {
1962 inst->opcode = BRW_OPCODE_MOV;
1963 inst->src[1] = reg_undef;
1964 progress = true;
1965 break;
1966 }
1967
1968 /* a * 0.0 = 0.0 */
1969 if (inst->src[1].is_zero()) {
1970 inst->opcode = BRW_OPCODE_MOV;
1971 inst->src[0] = inst->src[1];
1972 inst->src[1] = reg_undef;
1973 progress = true;
1974 break;
1975 }
1976
1977 break;
1978 case BRW_OPCODE_ADD:
1979 if (inst->src[1].file != IMM)
1980 continue;
1981
1982 /* a + 0.0 = a */
1983 if (inst->src[1].is_zero()) {
1984 inst->opcode = BRW_OPCODE_MOV;
1985 inst->src[1] = reg_undef;
1986 progress = true;
1987 break;
1988 }
1989 break;
1990 case BRW_OPCODE_OR:
1991 if (inst->src[0].equals(inst->src[1])) {
1992 inst->opcode = BRW_OPCODE_MOV;
1993 inst->src[1] = reg_undef;
1994 progress = true;
1995 break;
1996 }
1997 break;
1998 case BRW_OPCODE_LRP:
1999 if (inst->src[1].equals(inst->src[2])) {
2000 inst->opcode = BRW_OPCODE_MOV;
2001 inst->src[0] = inst->src[1];
2002 inst->src[1] = reg_undef;
2003 inst->src[2] = reg_undef;
2004 progress = true;
2005 break;
2006 }
2007 break;
2008 case BRW_OPCODE_SEL:
2009 if (inst->src[0].equals(inst->src[1])) {
2010 inst->opcode = BRW_OPCODE_MOV;
2011 inst->src[1] = reg_undef;
2012 inst->predicate = BRW_PREDICATE_NONE;
2013 inst->predicate_inverse = false;
2014 progress = true;
2015 } else if (inst->saturate && inst->src[1].file == IMM) {
2016 switch (inst->conditional_mod) {
2017 case BRW_CONDITIONAL_LE:
2018 case BRW_CONDITIONAL_L:
2019 switch (inst->src[1].type) {
2020 case BRW_REGISTER_TYPE_F:
2021 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2022 inst->opcode = BRW_OPCODE_MOV;
2023 inst->src[1] = reg_undef;
2024 progress = true;
2025 }
2026 break;
2027 default:
2028 break;
2029 }
2030 break;
2031 case BRW_CONDITIONAL_GE:
2032 case BRW_CONDITIONAL_G:
2033 switch (inst->src[1].type) {
2034 case BRW_REGISTER_TYPE_F:
2035 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2036 inst->opcode = BRW_OPCODE_MOV;
2037 inst->src[1] = reg_undef;
2038 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2039 progress = true;
2040 }
2041 break;
2042 default:
2043 break;
2044 }
2045 default:
2046 break;
2047 }
2048 }
2049 break;
2050 default:
2051 break;
2052 }
2053 }
2054
2055 return progress;
2056 }
2057
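/**
 * Attempt to rewrite computations that feed MRF-bound MOVs to write the MRF
 * directly.
 *
 * For each MOV from a GRF to an MRF, walk backwards to find the instruction
 * that produced the GRF value; if it is safe to do so (no intervening reads
 * of the GRF, no conflicting MRF writes or SENDs, no control flow), retarget
 * that instruction's destination to the MRF and delete the MOV.
 */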
2058 bool
2059 fs_visitor::compute_to_mrf()
2060 {
2061 bool progress = false;
2062 int next_ip = 0;
2063
2064 calculate_live_intervals();
2065
2066 foreach_in_list_safe(fs_inst, inst, &instructions) {
2067 int ip = next_ip;
2068 next_ip++;
2069
2070 if (inst->opcode != BRW_OPCODE_MOV ||
2071 inst->is_partial_write() ||
2072 inst->dst.file != MRF || inst->src[0].file != GRF ||
2073 inst->dst.type != inst->src[0].type ||
2074 inst->src[0].abs || inst->src[0].negate ||
2075 !inst->src[0].is_contiguous() ||
2076 inst->src[0].subreg_offset)
2077 continue;
2078
2079 /* Work out which hardware MRF registers are written by this
2080 * instruction.
2081 */
2082 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2083 int mrf_high;
2084 if (inst->dst.reg & BRW_MRF_COMPR4) {
2085 mrf_high = mrf_low + 4;
2086 } else if (dispatch_width == 16 &&
2087 (!inst->force_uncompressed && !inst->force_sechalf)) {
2088 mrf_high = mrf_low + 1;
2089 } else {
2090 mrf_high = mrf_low;
2091 }
2092
2093 /* Can't compute-to-MRF this GRF if someone else was going to
2094 * read it later.
2095 */
2096 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2097 continue;
2098
2099 /* Found a move of a GRF to a MRF. Let's see if we can go
2100 * rewrite the thing that made this GRF to write into the MRF.
2101 */
2102 fs_inst *scan_inst;
2103 for (scan_inst = (fs_inst *)inst->prev;
2104 !scan_inst->is_head_sentinel();
2105 scan_inst = (fs_inst *)scan_inst->prev) {
2106 if (scan_inst->dst.file == GRF &&
2107 scan_inst->dst.reg == inst->src[0].reg) {
2108 /* Found the last instruction that wrote the register we want to
2109 * turn into a compute-to-MRF.
2110 */
2111
2112 /* If this one instruction didn't populate all the
2113 * channels, bail. We might be able to rewrite everything
2114 * that writes that reg, but it would require smarter
2115 * tracking to delay the rewriting until complete success.
2116 */
2117 if (scan_inst->is_partial_write())
2118 break;
2119
2120 /* Instructions that write more than one register would require us
2121 * to coalesce more than one MOV at a time, which we don't handle.
2122 */
2123 if (scan_inst->regs_written > 1)
2124 break;
2125
2126 /* SEND instructions can't have MRF as a destination. */
2127 if (scan_inst->mlen)
2128 break;
2129
2130 if (brw->gen == 6) {
2131 /* gen6 math instructions must have the destination be
2132 * GRF, so no compute-to-MRF for them.
2133 */
2134 if (scan_inst->is_math()) {
2135 break;
2136 }
2137 }
2138
2139 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2140 /* Found the creator of our MRF's source value. */
2141 scan_inst->dst.file = MRF;
2142 scan_inst->dst.reg = inst->dst.reg;
2143 scan_inst->saturate |= inst->saturate;
2144 inst->remove();
2145 progress = true;
2146 }
2147 break;
2148 }
2149
2150 /* We don't handle control flow here. Most computation of
2151 * values that end up in MRFs is done shortly before the MRF
2152 * write anyway.
2153 */
2154 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2155 break;
2156
2157 /* You can't read from an MRF, so if someone else reads our
2158 * MRF's source GRF that we wanted to rewrite, that stops us.
2159 */
2160 bool interfered = false;
2161 for (int i = 0; i < scan_inst->sources; i++) {
2162 if (scan_inst->src[i].file == GRF &&
2163 scan_inst->src[i].reg == inst->src[0].reg &&
2164 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2165 interfered = true;
2166 }
2167 }
2168 if (interfered)
2169 break;
2170
2171 if (scan_inst->dst.file == MRF) {
2172 /* If somebody else writes our MRF here, we can't
2173 * compute-to-MRF before that.
2174 */
2175 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2176 int scan_mrf_high;
2177
2178 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2179 scan_mrf_high = scan_mrf_low + 4;
2180 } else if (dispatch_width == 16 &&
2181 (!scan_inst->force_uncompressed &&
2182 !scan_inst->force_sechalf)) {
2183 scan_mrf_high = scan_mrf_low + 1;
2184 } else {
2185 scan_mrf_high = scan_mrf_low;
2186 }
2187
2188 if (mrf_low == scan_mrf_low ||
2189 mrf_low == scan_mrf_high ||
2190 mrf_high == scan_mrf_low ||
2191 mrf_high == scan_mrf_high) {
2192 break;
2193 }
2194 }
2195
2196 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2197 /* Found a SEND instruction, which means that there are
2198 * live values in MRFs from base_mrf to base_mrf +
2199 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2200 * above it.
2201 */
2202 if (mrf_low >= scan_inst->base_mrf &&
2203 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2204 break;
2205 }
2206 if (mrf_high >= scan_inst->base_mrf &&
2207 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2208 break;
2209 }
2210 }
2211 }
2212 }
2213
2214 if (progress)
2215 invalidate_live_intervals();
2216
2217 return progress;
2218 }
2219
2220 /**
2221 * Walks through basic blocks, looking for repeated MRF writes and
2222 * removing the later ones.
2223 */
2224 bool
2225 fs_visitor::remove_duplicate_mrf_writes()
2226 {
2227 fs_inst *last_mrf_move[16];
2228 bool progress = false;
2229
2230 /* The MRF tracking below doesn't handle compressed (SIMD16) instructions yet. */
2231 if (dispatch_width == 16)
2232 return false;
2233
2234 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2235
2236 foreach_in_list_safe(fs_inst, inst, &instructions) {
2237 if (inst->is_control_flow()) {
2238 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2239 }
2240
2241 if (inst->opcode == BRW_OPCODE_MOV &&
2242 inst->dst.file == MRF) {
2243 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2244 if (prev_inst && inst->equals(prev_inst)) {
2245 inst->remove();
2246 progress = true;
2247 continue;
2248 }
2249 }
2250
2251 /* Clear out the last-write records for MRFs that were overwritten. */
2252 if (inst->dst.file == MRF) {
2253 last_mrf_move[inst->dst.reg] = NULL;
2254 }
2255
2256 if (inst->mlen > 0 && inst->base_mrf != -1) {
2257 /* Found a SEND instruction, which will include two or fewer
2258 * implied MRF writes. We could do better here.
2259 */
2260 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2261 last_mrf_move[inst->base_mrf + i] = NULL;
2262 }
2263 }
2264
2265 /* Clear out any MRF move records whose sources got overwritten. */
2266 if (inst->dst.file == GRF) {
2267 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2268 if (last_mrf_move[i] &&
2269 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2270 last_mrf_move[i] = NULL;
2271 }
2272 }
2273 }
2274
2275 if (inst->opcode == BRW_OPCODE_MOV &&
2276 inst->dst.file == MRF &&
2277 inst->src[0].file == GRF &&
2278 !inst->is_partial_write()) {
2279 last_mrf_move[inst->dst.reg] = inst;
2280 }
2281 }
2282
2283 if (progress)
2284 invalidate_live_intervals();
2285
2286 return progress;
2287 }
2288
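/**
 * Helper for the gen4 SEND dependency workarounds: clears the needs-dep flag
 * for any GRF in [first_grf, first_grf + grf_len) that is read as a source
 * of @inst, since a read of the register resolves the outstanding dependency.
 */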
2289 static void
2290 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2291 int first_grf, int grf_len)
2292 {
2293 bool inst_simd16 = (dispatch_width > 8 &&
2294 !inst->force_uncompressed &&
2295 !inst->force_sechalf);
2296
2297 /* Clear the flag for registers that actually got read (as expected). */
2298 for (int i = 0; i < inst->sources; i++) {
2299 int grf;
2300 if (inst->src[i].file == GRF) {
2301 grf = inst->src[i].reg;
2302 } else if (inst->src[i].file == HW_REG &&
2303 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2304 grf = inst->src[i].fixed_hw_reg.nr;
2305 } else {
2306 continue;
2307 }
2308
2309 if (grf >= first_grf &&
2310 grf < first_grf + grf_len) {
2311 deps[grf - first_grf] = false;
2312 if (inst_simd16)
2313 deps[grf - first_grf + 1] = false;
2314 }
2315 }
2316 }
2317
2318 /**
2319 * Implements this workaround for the original 965:
2320 *
2321 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2322 * check for post destination dependencies on this instruction, software
2323 * must ensure that there is no destination hazard for the case of ‘write
2324 * followed by a posted write’ shown in the following example.
2325 *
2326 * 1. mov r3 0
2327 * 2. send r3.xy <rest of send instruction>
2328 * 3. mov r2 r3
2329 *
2330 * Due to no post-destination dependency check on the ‘send’, the above
2331 * code sequence could have two instructions (1 and 2) in flight at the
2332 * same time that both consider ‘r3’ as the target of their final writes."
2333 */
2334 void
2335 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2336 {
2337 int reg_size = dispatch_width / 8;
2338 int write_len = inst->regs_written * reg_size;
2339 int first_write_grf = inst->dst.reg;
2340 bool needs_dep[BRW_MAX_MRF];
2341 assert(write_len < (int)sizeof(needs_dep) - 1);
2342
2343 memset(needs_dep, false, sizeof(needs_dep));
2344 memset(needs_dep, true, write_len);
2345
2346 clear_deps_for_inst_src(inst, dispatch_width,
2347 needs_dep, first_write_grf, write_len);
2348
2349 /* Walk backwards looking for writes to registers we're writing which
2350 * aren't read since being written. If we hit the start of the program,
2351 * we assume that there are no outstanding dependencies on entry to the
2352 * program.
2353 */
2354 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2355 !scan_inst->is_head_sentinel();
2356 scan_inst = (fs_inst *)scan_inst->prev) {
2357
2358 /* If we hit control flow, assume that there *are* outstanding
2359 * dependencies, and force their cleanup before our instruction.
2360 */
2361 if (scan_inst->is_control_flow()) {
2362 for (int i = 0; i < write_len; i++) {
2363 if (needs_dep[i]) {
2364 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2365 }
2366 }
2367 return;
2368 }
2369
2370 bool scan_inst_simd16 = (dispatch_width > 8 &&
2371 !scan_inst->force_uncompressed &&
2372 !scan_inst->force_sechalf);
2373
2374 /* We insert our reads as late as possible, on the assumption that any
2375 * non-MOV instruction that might have left us an outstanding
2376 * dependency has more latency than a MOV.
2377 */
2378 if (scan_inst->dst.file == GRF) {
2379 for (int i = 0; i < scan_inst->regs_written; i++) {
2380 int reg = scan_inst->dst.reg + i * reg_size;
2381
2382 if (reg >= first_write_grf &&
2383 reg < first_write_grf + write_len &&
2384 needs_dep[reg - first_write_grf]) {
2385 inst->insert_before(DEP_RESOLVE_MOV(reg));
2386 needs_dep[reg - first_write_grf] = false;
2387 if (scan_inst_simd16)
2388 needs_dep[reg - first_write_grf + 1] = false;
2389 }
2390 }
2391 }
2392
2393 /* Clear the flag for registers that actually got read (as expected). */
2394 clear_deps_for_inst_src(scan_inst, dispatch_width,
2395 needs_dep, first_write_grf, write_len);
2396
2397 /* Continue the loop only if we haven't resolved all the dependencies */
2398 int i;
2399 for (i = 0; i < write_len; i++) {
2400 if (needs_dep[i])
2401 break;
2402 }
2403 if (i == write_len)
2404 return;
2405 }
2406 }
2407
2408 /**
2409 * Implements this workaround for the original 965:
2410 *
2411 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2412 * used as a destination register until after it has been sourced by an
2413 * instruction with a different destination register."
2414 */
2415 void
2416 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2417 {
2418 int write_len = inst->regs_written * dispatch_width / 8;
2419 int first_write_grf = inst->dst.reg;
2420 bool needs_dep[BRW_MAX_MRF];
2421 assert(write_len < (int)sizeof(needs_dep) - 1);
2422
2423 memset(needs_dep, false, sizeof(needs_dep));
2424 memset(needs_dep, true, write_len);
2425 /* Walk forwards looking for writes to registers we're writing which aren't
2426 * read before being written.
2427 */
2428 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2429 !scan_inst->is_tail_sentinel();
2430 scan_inst = (fs_inst *)scan_inst->next) {
2431 /* If we hit control flow, force resolve all remaining dependencies. */
2432 if (scan_inst->is_control_flow()) {
2433 for (int i = 0; i < write_len; i++) {
2434 if (needs_dep[i])
2435 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2436 }
2437 return;
2438 }
2439
2440 /* Clear the flag for registers that actually got read (as expected). */
2441 clear_deps_for_inst_src(scan_inst, dispatch_width,
2442 needs_dep, first_write_grf, write_len);
2443
2444 /* We insert our reads as late as possible since they're reading the
2445 * result of a SEND, which has massive latency.
2446 */
2447 if (scan_inst->dst.file == GRF &&
2448 scan_inst->dst.reg >= first_write_grf &&
2449 scan_inst->dst.reg < first_write_grf + write_len &&
2450 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2451 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2452 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2453 }
2454
2455 /* Continue the loop only if we haven't resolved all the dependencies */
2456 int i;
2457 for (i = 0; i < write_len; i++) {
2458 if (needs_dep[i])
2459 break;
2460 }
2461 if (i == write_len)
2462 return;
2463 }
2464
2465 /* If we hit the end of the program, resolve all remaining dependencies out
2466 * of paranoia.
2467 */
2468 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2469 assert(last_inst->eot);
2470 for (int i = 0; i < write_len; i++) {
2471 if (needs_dep[i])
2472 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2473 }
2474 }
2475
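/**
 * Applies both gen4 SEND dependency workarounds (pre- and post-send) to every
 * message-sending instruction that writes a GRF destination. Only needed on
 * the original 965 (gen4, excluding G4x).
 */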
2476 void
2477 fs_visitor::insert_gen4_send_dependency_workarounds()
2478 {
2479 if (brw->gen != 4 || brw->is_g4x)
2480 return;
2481
2482 bool progress = false;
2483
2484 /* Note that we're done with register allocation, so GRF fs_regs always
2485 * have a .reg_offset of 0.
2486 */
2487
2488 foreach_in_list_safe(fs_inst, inst, &instructions) {
2489 if (inst->mlen != 0 && inst->dst.file == GRF) {
2490 insert_gen4_pre_send_dependency_workarounds(inst);
2491 insert_gen4_post_send_dependency_workarounds(inst);
2492 progress = true;
2493 }
2494 }
2495
2496 if (progress)
2497 invalidate_live_intervals();
2498 }
2499
2500 /**
2501 * Turns the generic expression-style uniform pull constant load instruction
2502 * into a hardware-specific series of instructions for loading a pull
2503 * constant.
2504 *
2505 * The expression style allows the CSE pass before this to optimize out
2506 * repeated loads from the same offset, and gives the pre-register-allocation
2507 * scheduling full flexibility, while the conversion to native instructions
2508 * allows the post-register-allocation scheduler the best information
2509 * possible.
2510 *
2511 * Note that execution masking for setting up pull constant loads is special:
2512 * the channels that need to be written are unrelated to the current execution
2513 * mask, since a later instruction will use one of the result channels as a
2514 * source operand for all 8 or 16 of its channels.
2515 */
2516 void
2517 fs_visitor::lower_uniform_pull_constant_loads()
2518 {
2519 foreach_in_list(fs_inst, inst, &instructions) {
2520 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2521 continue;
2522
2523 if (brw->gen >= 7) {
2524 /* The offset arg before was a vec4-aligned byte offset. We need to
2525 * turn it into a dword offset.
2526 */
2527 fs_reg const_offset_reg = inst->src[1];
2528 assert(const_offset_reg.file == IMM &&
2529 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2530 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2531 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2532
2533 /* This is actually going to be a MOV, but since only the first dword
2534 * is accessed, we have a special opcode to do just that one. Note
2535 * that this needs to be an operation that will be considered a def
2536 * by live variable analysis, or register allocation will explode.
2537 */
2538 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2539 payload, const_offset_reg);
2540 setup->force_writemask_all = true;
2541
2542 setup->ir = inst->ir;
2543 setup->annotation = inst->annotation;
2544 inst->insert_before(setup);
2545
2546 /* Similarly, this will only populate the first 4 channels of the
2547 * result register (since we only use smear values from 0-3), but we
2548 * don't tell the optimizer.
2549 */
2550 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2551 inst->src[1] = payload;
2552
2553 invalidate_live_intervals();
2554 } else {
2555 /* Before register allocation, we didn't tell the scheduler about the
2556 * MRF we use. We know it's safe to use this MRF because nothing
2557 * else does except for register spill/unspill, which generates and
2558 * uses its MRF within a single IR instruction.
2559 */
2560 inst->base_mrf = 14;
2561 inst->mlen = 1;
2562 }
2563 }
2564 }
2565
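/**
 * Lowers SHADER_OPCODE_LOAD_PAYLOAD into a sequence of MOVs that copy each
 * source into consecutive registers of the destination, starting with the
 * optional message header in src[0].
 */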
2566 bool
2567 fs_visitor::lower_load_payload()
2568 {
2569 bool progress = false;
2570
2571 foreach_in_list_safe(fs_inst, inst, &instructions) {
2572 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2573 fs_reg dst = inst->dst;
2574
2575 /* src[0] represents the (optional) message header. */
2576 if (inst->src[0].file != BAD_FILE) {
2577 inst->insert_before(MOV(dst, inst->src[0]));
2578 }
2579 dst.reg_offset++;
2580
2581 for (int i = 1; i < inst->sources; i++) {
2582 inst->insert_before(MOV(dst, inst->src[i]));
2583 dst.reg_offset++;
2584 }
2585
2586 inst->remove();
2587 progress = true;
2588 }
2589 }
2590
2591 if (progress)
2592 invalidate_live_intervals();
2593
2594 return progress;
2595 }
2596
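/**
 * Dumps the instruction stream along with per-instruction register pressure,
 * either to stderr or to the named file, and reports the maximum number of
 * registers live at once.
 */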
2597 void
2598 fs_visitor::dump_instructions()
2599 {
2600 dump_instructions(NULL);
2601 }
2602
2603 void
2604 fs_visitor::dump_instructions(const char *name)
2605 {
2606 calculate_register_pressure();
2607 FILE *file = stderr;
2608 if (name && geteuid() != 0) {
2609 file = fopen(name, "w");
2610 if (!file)
2611 file = stderr;
2612 }
2613
2614 int ip = 0, max_pressure = 0;
2615 foreach_in_list(backend_instruction, inst, &instructions) {
2616 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2617 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2618 dump_instruction(inst, file);
2619 ++ip;
2620 }
2621 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2622
2623 if (file != stderr) {
2624 fclose(file);
2625 }
2626 }
2627
2628 void
2629 fs_visitor::dump_instruction(backend_instruction *be_inst)
2630 {
2631 dump_instruction(be_inst, stderr);
2632 }
2633
2634 void
2635 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2636 {
2637 fs_inst *inst = (fs_inst *)be_inst;
2638
2639 if (inst->predicate) {
2640 fprintf(file, "(%cf0.%d) ",
2641 inst->predicate_inverse ? '-' : '+',
2642 inst->flag_subreg);
2643 }
2644
2645 fprintf(file, "%s", brw_instruction_name(inst->opcode));
2646 if (inst->saturate)
2647 fprintf(file, ".sat");
2648 if (inst->conditional_mod) {
2649 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2650 if (!inst->predicate &&
2651 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2652 inst->opcode != BRW_OPCODE_IF &&
2653 inst->opcode != BRW_OPCODE_WHILE))) {
2654 fprintf(file, ".f0.%d", inst->flag_subreg);
2655 }
2656 }
2657 fprintf(file, " ");
2658
2659
2660 switch (inst->dst.file) {
2661 case GRF:
2662 fprintf(file, "vgrf%d", inst->dst.reg);
2663 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2664 inst->dst.subreg_offset)
2665 fprintf(file, "+%d.%d",
2666 inst->dst.reg_offset, inst->dst.subreg_offset);
2667 break;
2668 case MRF:
2669 fprintf(file, "m%d", inst->dst.reg);
2670 break;
2671 case BAD_FILE:
2672 fprintf(file, "(null)");
2673 break;
2674 case UNIFORM:
2675 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2676 break;
2677 case HW_REG:
2678 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2679 switch (inst->dst.fixed_hw_reg.nr) {
2680 case BRW_ARF_NULL:
2681 fprintf(file, "null");
2682 break;
2683 case BRW_ARF_ADDRESS:
2684 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2685 break;
2686 case BRW_ARF_ACCUMULATOR:
2687 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2688 break;
2689 case BRW_ARF_FLAG:
2690 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2691 inst->dst.fixed_hw_reg.subnr);
2692 break;
2693 default:
2694 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2695 inst->dst.fixed_hw_reg.subnr);
2696 break;
2697 }
2698 } else {
2699 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2700 }
2701 if (inst->dst.fixed_hw_reg.subnr)
2702 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2703 break;
2704 default:
2705 fprintf(file, "???");
2706 break;
2707 }
2708 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2709
2710 for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2711 if (inst->src[i].negate)
2712 fprintf(file, "-");
2713 if (inst->src[i].abs)
2714 fprintf(file, "|");
2715 switch (inst->src[i].file) {
2716 case GRF:
2717 fprintf(file, "vgrf%d", inst->src[i].reg);
2718 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2719 inst->src[i].subreg_offset)
2720 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2721 inst->src[i].subreg_offset);
2722 break;
2723 case MRF:
2724 fprintf(file, "***m%d***", inst->src[i].reg);
2725 break;
2726 case UNIFORM:
2727 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2728 if (inst->src[i].reladdr) {
2729 fprintf(file, "+reladdr");
2730 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2731 inst->src[i].subreg_offset) {
2732 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2733 inst->src[i].subreg_offset);
2734 }
2735 break;
2736 case BAD_FILE:
2737 fprintf(file, "(null)");
2738 break;
2739 case IMM:
2740 switch (inst->src[i].type) {
2741 case BRW_REGISTER_TYPE_F:
2742 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
2743 break;
2744 case BRW_REGISTER_TYPE_D:
2745 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
2746 break;
2747 case BRW_REGISTER_TYPE_UD:
2748 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
2749 break;
2750 default:
2751 fprintf(file, "???");
2752 break;
2753 }
2754 break;
2755 case HW_REG:
2756 if (inst->src[i].fixed_hw_reg.negate)
2757 fprintf(file, "-");
2758 if (inst->src[i].fixed_hw_reg.abs)
2759 fprintf(file, "|");
2760 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2761 switch (inst->src[i].fixed_hw_reg.nr) {
2762 case BRW_ARF_NULL:
2763 fprintf(file, "null");
2764 break;
2765 case BRW_ARF_ADDRESS:
2766 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2767 break;
2768 case BRW_ARF_ACCUMULATOR:
2769 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2770 break;
2771 case BRW_ARF_FLAG:
2772 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2773 inst->src[i].fixed_hw_reg.subnr);
2774 break;
2775 default:
2776 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2777 inst->src[i].fixed_hw_reg.subnr);
2778 break;
2779 }
2780 } else {
2781 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2782 }
2783 if (inst->src[i].fixed_hw_reg.subnr)
2784 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2785 if (inst->src[i].fixed_hw_reg.abs)
2786 fprintf(file, "|");
2787 break;
2788 default:
2789 fprintf(file, "???");
2790 break;
2791 }
2792 if (inst->src[i].abs)
2793 fprintf(file, "|");
2794
2795 if (inst->src[i].file != IMM) {
2796 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2797 }
2798
2799 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2800 fprintf(file, ", ");
2801 }
2802
2803 fprintf(file, " ");
2804
2805 if (inst->force_uncompressed)
2806 fprintf(file, "1sthalf ");
2807
2808 if (inst->force_sechalf)
2809 fprintf(file, "2ndhalf ");
2810
2811 fprintf(file, "\n");
2812 }
2813
2814 /**
2815 * Possibly returns an instruction that set up @param reg.
2816 *
2817 * Sometimes we want to take the result of some expression/variable
2818 * dereference tree and rewrite the instruction generating the result
2819 * of the tree. When processing the tree, we know that the
2820 * instructions generated are all writing temporaries that are dead
2821 * outside of this tree. So, if we have some instructions that write
2822 * a temporary, we're free to point that temp write somewhere else.
2823 *
2824 * Note that this doesn't guarantee that the returned instruction wrote
2825 * only reg -- it might be the size=4 destination of a texture instruction.
2826 */
2827 fs_inst *
2828 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2829 fs_inst *end,
2830 const fs_reg &reg)
2831 {
2832 if (end == start ||
2833 end->is_partial_write() ||
2834 reg.reladdr ||
2835 !reg.equals(end->dst)) {
2836 return NULL;
2837 } else {
2838 return end;
2839 }
2840 }
2841
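/**
 * Lays out the gen6+ fragment shader thread payload: dispatch masks,
 * barycentric interpolation coordinates, source depth/W, MSAA position
 * offsets, and the input coverage mask, recording the starting register of
 * each section in the payload structure.
 */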
2842 void
2843 fs_visitor::setup_payload_gen6()
2844 {
2845 bool uses_depth =
2846 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2847 unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
2848
2849 assert(brw->gen >= 6);
2850
2851 /* R0-1: masks, pixel X/Y coordinates. */
2852 payload.num_regs = 2;
2853 /* R2: only for 32-pixel dispatch. */
2854
2855 /* R3-26: barycentric interpolation coordinates. These appear in the
2856 * same order that they appear in the brw_wm_barycentric_interp_mode
2857 * enum. Each set of coordinates occupies 2 registers if dispatch width
2858 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2859 * appear if they were enabled using the "Barycentric Interpolation
2860 * Mode" bits in WM_STATE.
2861 */
2862 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2863 if (barycentric_interp_modes & (1 << i)) {
2864 payload.barycentric_coord_reg[i] = payload.num_regs;
2865 payload.num_regs += 2;
2866 if (dispatch_width == 16) {
2867 payload.num_regs += 2;
2868 }
2869 }
2870 }
2871
2872 /* R27: interpolated depth if uses source depth */
2873 if (uses_depth) {
2874 payload.source_depth_reg = payload.num_regs;
2875 payload.num_regs++;
2876 if (dispatch_width == 16) {
2877 /* R28: interpolated depth if not SIMD8. */
2878 payload.num_regs++;
2879 }
2880 }
2881 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2882 if (uses_depth) {
2883 payload.source_w_reg = payload.num_regs;
2884 payload.num_regs++;
2885 if (dispatch_width == 16) {
2886 /* R30: interpolated W if not SIMD8. */
2887 payload.num_regs++;
2888 }
2889 }
2890
2891 prog_data->uses_pos_offset = key->compute_pos_offset;
2892 /* R31: MSAA position offsets. */
2893 if (prog_data->uses_pos_offset) {
2894 payload.sample_pos_reg = payload.num_regs;
2895 payload.num_regs++;
2896 }
2897
2898 /* R32: MSAA input coverage mask */
2899 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2900 assert(brw->gen >= 7);
2901 payload.sample_mask_in_reg = payload.num_regs;
2902 payload.num_regs++;
2903 if (dispatch_width == 16) {
2904 /* R33: input coverage mask if not SIMD8. */
2905 payload.num_regs++;
2906 }
2907 }
2908
2909 /* R34-: bary for 32-pixel. */
2910 /* R58-59: interp W for 32-pixel. */
2911
2912 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2913 source_depth_to_render_target = true;
2914 }
2915 }
2916
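/**
 * Assigns surface binding table offsets, placing the render targets first
 * (reserving at least one slot for a null renderbuffer) followed by the
 * common per-stage surfaces.
 */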
2917 void
2918 fs_visitor::assign_binding_table_offsets()
2919 {
2920 uint32_t next_binding_table_offset = 0;
2921
2922 /* If there are no color regions, we still perform an FB write to a null
2923 * renderbuffer, which we place at surface index 0.
2924 */
2925 prog_data->binding_table.render_target_start = next_binding_table_offset;
2926 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
2927
2928 assign_common_binding_table_offsets(next_binding_table_offset);
2929 }
2930
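/**
 * Computes regs_live_at_ip[], the number of virtual GRF registers live at
 * each instruction, by summing the sizes of all virtual GRFs whose live
 * range covers that IP.
 */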
2931 void
2932 fs_visitor::calculate_register_pressure()
2933 {
2934 invalidate_live_intervals();
2935 calculate_live_intervals();
2936
2937 unsigned num_instructions = instructions.length();
2938
2939 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
2940
2941 for (int reg = 0; reg < virtual_grf_count; reg++) {
2942 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
2943 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
2944 }
2945 }
2946
2947 /**
2948 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
2949 *
2950 * The needs_unlit_centroid_workaround ends up producing one of these per
2951 * channel of centroid input, so it's good to clean them up.
2952 *
2953 * An assumption here is that nothing ever modifies the dispatched pixels
2954 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
2955 * dictates that anyway.
2956 */
2957 void
2958 fs_visitor::opt_drop_redundant_mov_to_flags()
2959 {
2960 bool flag_mov_found[2] = {false};
2961
2962 foreach_in_list_safe(fs_inst, inst, &instructions) {
2963 if (inst->is_control_flow()) {
2964 memset(flag_mov_found, 0, sizeof(flag_mov_found));
2965 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
2966 if (!flag_mov_found[inst->flag_subreg])
2967 flag_mov_found[inst->flag_subreg] = true;
2968 else
2969 inst->remove();
2970 } else if (inst->writes_flag()) {
2971 flag_mov_found[inst->flag_subreg] = false;
2972 }
2973 }
2974 }
2975
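/**
 * Drives the fragment shader compile: sets up the payload, emits FS IR for
 * the program, runs the optimization loop, lowers the remaining virtual
 * opcodes, schedules, and register allocates (failing the SIMD16 compile or
 * spilling in SIMD8 if allocation doesn't succeed). Returns false on failure.
 */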
2976 bool
2977 fs_visitor::run()
2978 {
2979 sanity_param_count = fp->Base.Parameters->NumParameters;
2980 bool allocated_without_spills;
2981
2982 assign_binding_table_offsets();
2983
2984 if (brw->gen >= 6)
2985 setup_payload_gen6();
2986 else
2987 setup_payload_gen4();
2988
2989 if (0) {
2990 emit_dummy_fs();
2991 } else {
2992 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2993 emit_shader_time_begin();
2994
2995 calculate_urb_setup();
2996 if (fp->Base.InputsRead > 0) {
2997 if (brw->gen < 6)
2998 emit_interpolation_setup_gen4();
2999 else
3000 emit_interpolation_setup_gen6();
3001 }
3002
3003 /* We handle discards by keeping track of the still-live pixels in f0.1.
3004 * Initialize it with the dispatched pixels.
3005 */
3006 if (fp->UsesKill || key->alpha_test_func) {
3007 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3008 discard_init->flag_subreg = 1;
3009 }
3010
3011 /* Generate FS IR for main(). (the visitor only descends into
3012 * functions called "main").
3013 */
3014 if (shader) {
3015 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3016 base_ir = ir;
3017 this->result = reg_undef;
3018 ir->accept(this);
3019 }
3020 } else {
3021 emit_fragment_program_code();
3022 }
3023 base_ir = NULL;
3024 if (failed)
3025 return false;
3026
3027 emit(FS_OPCODE_PLACEHOLDER_HALT);
3028
3029 if (key->alpha_test_func)
3030 emit_alpha_test();
3031
3032 emit_fb_writes();
3033
3034 split_virtual_grfs();
3035
3036 move_uniform_array_access_to_pull_constants();
3037 assign_constant_locations();
3038 demote_pull_constants();
3039
3040 opt_drop_redundant_mov_to_flags();
3041
3042 #define OPT(pass, args...) do { \
3043 pass_num++; \
3044 bool this_progress = pass(args); \
3045 \
3046 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3047 char filename[64]; \
3048 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3049 dispatch_width, shader_prog->Name, iteration, pass_num); \
3050 \
3051 backend_visitor::dump_instructions(filename); \
3052 } \
3053 \
3054 progress = progress || this_progress; \
3055 } while (false)
3056
3057 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3058 char filename[64];
3059 snprintf(filename, 64, "fs%d-%04d-00-start",
3060 dispatch_width, shader_prog->Name);
3061
3062 backend_visitor::dump_instructions(filename);
3063 }
3064
3065 bool progress;
3066 int iteration = 0;
3067 do {
3068 progress = false;
3069 iteration++;
3070 int pass_num = 0;
3071
3072 compact_virtual_grfs();
3073
3074 OPT(remove_duplicate_mrf_writes);
3075
3076 OPT(opt_algebraic);
3077 OPT(opt_cse);
3078 OPT(opt_copy_propagate);
3079 OPT(opt_peephole_predicated_break);
3080 OPT(dead_code_eliminate);
3081 OPT(opt_peephole_sel);
3082 OPT(dead_control_flow_eliminate, this);
3083 OPT(opt_saturate_propagation);
3084 OPT(register_coalesce);
3085 OPT(compute_to_mrf);
3086 } while (progress);
3087
3088 if (lower_load_payload()) {
3089 register_coalesce();
3090 dead_code_eliminate();
3091 }
3092
3093 lower_uniform_pull_constant_loads();
3094
3095 assign_curb_setup();
3096 assign_urb_setup();
3097
3098 static enum instruction_scheduler_mode pre_modes[] = {
3099 SCHEDULE_PRE,
3100 SCHEDULE_PRE_NON_LIFO,
3101 SCHEDULE_PRE_LIFO,
3102 };
3103
3104 /* Try each scheduling heuristic to see if it can successfully register
3105 * allocate without spilling. They should be ordered by decreasing
3106 * performance but increasing likelihood of allocating.
3107 */
3108 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3109 schedule_instructions(pre_modes[i]);
3110
3111 if (0) {
3112 assign_regs_trivial();
3113 allocated_without_spills = true;
3114 } else {
3115 allocated_without_spills = assign_regs(false);
3116 }
3117 if (allocated_without_spills)
3118 break;
3119 }
3120
3121 if (!allocated_without_spills) {
3122 /* We assume that any spilling is worse than just dropping back to
3123 * SIMD8. There's probably actually some intermediate point where
3124 * SIMD16 with a couple of spills is still better.
3125 */
3126 if (dispatch_width == 16) {
3127 fail("Failure to register allocate. Reduce number of "
3128 "live scalar values to avoid this.");
3129 } else {
3130 perf_debug("Fragment shader triggered register spilling. "
3131 "Try reducing the number of live scalar values to "
3132 "improve performance.\n");
3133 }
3134
3135 /* Since we're out of heuristics, just go spill registers until we
3136 * get an allocation.
3137 */
3138 while (!assign_regs(true)) {
3139 if (failed)
3140 break;
3141 }
3142 }
3143 }
3144 assert(force_uncompressed_stack == 0);
3145
3146 /* This must come after all optimization and register allocation, since
3147 * it inserts dead code that happens to have side effects, and it does
3148 * so based on the actual physical registers in use.
3149 */
3150 insert_gen4_send_dependency_workarounds();
3151
3152 if (failed)
3153 return false;
3154
3155 if (!allocated_without_spills)
3156 schedule_instructions(SCHEDULE_POST);
3157
3158 if (last_scratch > 0) {
3159 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3160 }
3161
3162 if (dispatch_width == 8)
3163 prog_data->reg_blocks = brw_register_blocks(grf_used);
3164 else
3165 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3166
3167 /* If any state parameters were appended, then ParameterValues could have
3168 * been realloced, in which case the driver uniform storage set up by
3169 * _mesa_associate_uniform_storage() would point to freed memory. Make
3170 * sure that didn't happen.
3171 */
3172 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3173
3174 return !failed;
3175 }
3176
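/**
 * Compiles a fragment program to native code: always compiles a SIMD8
 * variant, attempts a SIMD16 variant on gen5+ when supported, and hands the
 * resulting instruction lists to the generator for final assembly.
 */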
3177 const unsigned *
3178 brw_wm_fs_emit(struct brw_context *brw,
3179 void *mem_ctx,
3180 const struct brw_wm_prog_key *key,
3181 struct brw_wm_prog_data *prog_data,
3182 struct gl_fragment_program *fp,
3183 struct gl_shader_program *prog,
3184 unsigned *final_assembly_size)
3185 {
3186 bool start_busy = false;
3187 double start_time = 0;
3188
3189 if (unlikely(brw->perf_debug)) {
3190 start_busy = (brw->batch.last_bo &&
3191 drm_intel_bo_busy(brw->batch.last_bo));
3192 start_time = get_time();
3193 }
3194
3195 struct brw_shader *shader = NULL;
3196 if (prog)
3197 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3198
3199 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3200 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3201
3202 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3203 */
3204 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3205 if (!v.run()) {
3206 if (prog) {
3207 prog->LinkStatus = false;
3208 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3209 }
3210
3211 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3212 v.fail_msg);
3213
3214 return NULL;
3215 }
3216
3217 exec_list *simd16_instructions = NULL;
3218 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3219 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3220 if (!v.simd16_unsupported) {
3221 /* Try a SIMD16 compile */
3222 v2.import_uniforms(&v);
3223 if (!v2.run()) {
3224 perf_debug("SIMD16 shader failed to compile, falling back to "
3225 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3226 } else {
3227 simd16_instructions = &v2.instructions;
3228 }
3229 } else {
3230 perf_debug("SIMD16 shader unsupported, falling back to "
3231 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3232 }
3233 }
3234
3235 const unsigned *assembly = NULL;
3236 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3237 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3238 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3239 final_assembly_size);
3240
3241 if (unlikely(brw->perf_debug) && shader) {
3242 if (shader->compiled_once)
3243 brw_wm_debug_recompile(brw, prog, key);
3244 shader->compiled_once = true;
3245
3246 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3247 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3248 (get_time() - start_time) * 1000);
3249 }
3250 }
3251
3252 return assembly;
3253 }
3254
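/**
 * Precompiles the fragment shader at link time using a guessed program key
 * (default depth-test state, identity texture swizzles, and so on), so that
 * a likely variant is already built before the first draw.
 */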
3255 bool
3256 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3257 {
3258 struct brw_context *brw = brw_context(ctx);
3259 struct brw_wm_prog_key key;
3260
3261 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3262 return true;
3263
3264 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3265 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3266 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3267 bool program_uses_dfdy = fp->UsesDFdy;
3268
3269 memset(&key, 0, sizeof(key));
3270
3271 if (brw->gen < 6) {
3272 if (fp->UsesKill)
3273 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3274
3275 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3276 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3277
3278 /* Just assume depth testing. */
3279 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3280 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3281 }
3282
3283 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3284 BRW_FS_VARYING_INPUT_MASK) > 16)
3285 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3286
3287 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3288 for (unsigned i = 0; i < sampler_count; i++) {
3289 if (fp->Base.ShadowSamplers & (1 << i)) {
3290 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3291 key.tex.swizzles[i] =
3292 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3293 } else {
3294 /* Color sampler: assume no swizzling. */
3295 key.tex.swizzles[i] = SWIZZLE_XYZW;
3296 }
3297 }
3298
3299 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3300 key.drawable_height = ctx->DrawBuffer->Height;
3301 }
3302
3303 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3304 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3305 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3306
3307 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3308 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3309 key.nr_color_regions > 1;
3310 }
3311
3312 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3313 * quality of the derivatives is likely to be determined by the driconf
3314 * option.
3315 */
3316 key.high_quality_derivatives = brw->disable_derivative_optimization;
3317
3318 key.program_string_id = bfp->id;
3319
3320 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3321 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3322
3323 bool success = do_wm_prog(brw, prog, bfp, &key);
3324
3325 brw->wm.base.prog_offset = old_prog_offset;
3326 brw->wm.prog_data = old_prog_data;
3327
3328 return success;
3329 }