i965: Return NONE from brw_swap_cmod on unknown input.
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
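/* Common initializer shared by the fs_inst constructors below: zero the
 * whole instruction, then record the opcode, destination, source array and
 * source count, along with the defaults noted in the body.
 */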
54 void
55 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
56 {
57 memset(this, 0, sizeof(*this));
58
59 this->opcode = opcode;
60 this->dst = dst;
61 this->src = src;
62 this->sources = sources;
63
64 this->conditional_mod = BRW_CONDITIONAL_NONE;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68
69 this->writes_accumulator = false;
70 }
71
72 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
73 {
74 fs_reg *src = ralloc_array(this, fs_reg, 3);
75 init(opcode, dst, src, 0);
76 }
77
78 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
79 {
80 fs_reg *src = ralloc_array(this, fs_reg, 3);
81 src[0] = src0;
82 init(opcode, dst, src, 1);
83 }
84
85 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
86 const fs_reg &src1)
87 {
88 fs_reg *src = ralloc_array(this, fs_reg, 3);
89 src[0] = src0;
90 src[1] = src1;
91 init(opcode, dst, src, 2);
92 }
93
94 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
95 const fs_reg &src1, const fs_reg &src2)
96 {
97 fs_reg *src = ralloc_array(this, fs_reg, 3);
98 src[0] = src0;
99 src[1] = src1;
100 src[2] = src2;
101 init(opcode, dst, src, 3);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
105 {
106 init(opcode, dst, src, sources);
107 }
108
109 fs_inst::fs_inst(const fs_inst &that)
110 {
111 memcpy(this, &that, sizeof(that));
112
113 this->src = ralloc_array(this, fs_reg, that.sources);
114
115 for (int i = 0; i < that.sources; i++)
116 this->src[i] = that.src[i];
117 }
118
119 void
120 fs_inst::resize_sources(uint8_t num_sources)
121 {
122 if (this->sources != num_sources) {
123 this->src = reralloc(this, this->src, fs_reg, num_sources);
124 this->sources = num_sources;
125 }
126 }
127
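/* Convenience emitters: each ALU* macro expands to an fs_visitor method
 * (NOT(), ADD(), MAD(), ...) that simply allocates an fs_inst for the
 * matching BRW opcode.  The _ACC variants additionally flag the
 * instruction as writing the accumulator.
 */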
128 #define ALU1(op) \
129 fs_inst * \
130 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
131 { \
132 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
133 }
134
135 #define ALU2(op) \
136 fs_inst * \
137 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
138 const fs_reg &src1) \
139 { \
140 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
141 }
142
143 #define ALU2_ACC(op) \
144 fs_inst * \
145 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
146 const fs_reg &src1) \
147 { \
148 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
149 inst->writes_accumulator = true; \
150 return inst; \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
156 const fs_reg &src1, const fs_reg &src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2_ACC(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(SEL)
188 ALU2(MAC)
189
190 /** Gen4 predicated IF. */
191 fs_inst *
192 fs_visitor::IF(enum brw_predicate predicate)
193 {
194 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
195 inst->predicate = predicate;
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 fs_inst *
201 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
206 reg_null_d, src0, src1);
207 inst->conditional_mod = condition;
208 return inst;
209 }
210
211 /**
212 * CMP: Sets the low bit of the destination channels with the result
213 * of the comparison, while the upper bits are undefined, and updates
214 * the flag register with the packed 16 bits of the result.
215 */
216 fs_inst *
217 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
218 enum brw_conditional_mod condition)
219 {
220 fs_inst *inst;
221
222 /* Take the instruction:
223 *
224 * CMP null<d> src0<f> src1<f>
225 *
226 * Original gen4 does type conversion to the destination type before
227 * comparison, producing garbage results for floating point comparisons.
228 * gen5 does the comparison on the execution type (resolved source types),
229 * so dst type doesn't matter. gen6 does comparison and then uses the
230 * result as if it was the dst type with no conversion, which happens to
231 * mostly work out for float-interpreted-as-int since our comparisons are
232 * for >0, =0, <0.
233 */
234 if (brw->gen == 4) {
235 dst.type = src0.type;
236 if (dst.file == HW_REG)
237 dst.fixed_hw_reg.type = dst.type;
238 }
239
240 resolve_ud_negate(&src0);
241 resolve_ud_negate(&src1);
242
243 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
244 inst->conditional_mod = condition;
245
246 return inst;
247 }
248
249 fs_inst *
250 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
251 {
252 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
253 sources);
254 inst->regs_written = sources;
255
256 return inst;
257 }
258
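/* Emit the instruction sequence for a pull-constant load whose offset is
 * only known at run time.  The sequence is returned as a list rather than
 * emitted directly, so the caller can splice it in wherever it is needed.
 */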
259 exec_list
260 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
261 const fs_reg &surf_index,
262 const fs_reg &varying_offset,
263 uint32_t const_offset)
264 {
265 exec_list instructions;
266 fs_inst *inst;
267
268 /* We have our constant surface use a pitch of 4 bytes, so our index can
269 * be any component of a vector, and then we load 4 contiguous
270 * components starting from that.
271 *
272 * We break down the const_offset to a portion added to the variable
273 * offset and a portion done using reg_offset, which means that if you
274 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
275 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
276 * CSE can later notice that those loads are all the same and eliminate
277 * the redundant ones.
278 */
279 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
280 instructions.push_tail(ADD(vec4_offset,
281 varying_offset, const_offset & ~3));
282
283 int scale = 1;
284 if (brw->gen == 4 && dispatch_width == 8) {
285 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
286 * u, v, r) as parameters, or we can just use the SIMD16 message
287 * consisting of (header, u). We choose the second, at the cost of a
288 * longer return length.
289 */
290 scale = 2;
291 }
292
293 enum opcode op;
294 if (brw->gen >= 7)
295 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
296 else
297 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
298 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
299 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
300 inst->regs_written = 4 * scale;
301 instructions.push_tail(inst);
302
303 if (brw->gen < 7) {
304 inst->base_mrf = 13;
305 inst->header_present = true;
306 if (brw->gen == 4)
307 inst->mlen = 3;
308 else
309 inst->mlen = 1 + dispatch_width / 8;
310 }
311
312 vec4_result.reg_offset += (const_offset & 3) * scale;
313 instructions.push_tail(MOV(dst, vec4_result));
314
315 return instructions;
316 }
317
318 /**
319 * A helper for MOV generation for fixing up broken hardware SEND dependency
320 * handling.
321 */
322 fs_inst *
323 fs_visitor::DEP_RESOLVE_MOV(int grf)
324 {
325 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
326
327 inst->ir = NULL;
328 inst->annotation = "send dependency resolve";
329
330 /* The caller always wants uncompressed to emit the minimal extra
331 * dependencies, and to avoid having to deal with aligning its regs to 2.
332 */
333 inst->force_uncompressed = true;
334
335 return inst;
336 }
337
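/* Field-by-field equality test used by passes that need to know whether two
 * instructions would do exactly the same thing (e.g. to eliminate one of
 * them).
 */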
338 bool
339 fs_inst::equals(fs_inst *inst) const
340 {
341 return (opcode == inst->opcode &&
342 dst.equals(inst->dst) &&
343 src[0].equals(inst->src[0]) &&
344 src[1].equals(inst->src[1]) &&
345 src[2].equals(inst->src[2]) &&
346 saturate == inst->saturate &&
347 predicate == inst->predicate &&
348 conditional_mod == inst->conditional_mod &&
349 mlen == inst->mlen &&
350 base_mrf == inst->base_mrf &&
351 target == inst->target &&
352 eot == inst->eot &&
353 header_present == inst->header_present &&
354 shadow_compare == inst->shadow_compare &&
355 offset == inst->offset);
356 }
357
358 bool
359 fs_inst::overwrites_reg(const fs_reg &reg) const
360 {
361 return (reg.file == dst.file &&
362 reg.reg == dst.reg &&
363 reg.reg_offset >= dst.reg_offset &&
364 reg.reg_offset < dst.reg_offset + regs_written);
365 }
366
367 bool
368 fs_inst::is_send_from_grf() const
369 {
370 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
371 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
372 opcode == FS_OPCODE_INTERPOLATE_AT_CENTROID ||
373 opcode == FS_OPCODE_INTERPOLATE_AT_SAMPLE ||
374 opcode == FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET ||
375 opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET ||
376 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
377 src[1].file == GRF) ||
378 (is_tex() && src[0].file == GRF));
379 }
380
381 bool
382 fs_inst::can_do_source_mods(struct brw_context *brw)
383 {
384 if (brw->gen == 6 && is_math())
385 return false;
386
387 if (is_send_from_grf())
388 return false;
389
390 if (!backend_instruction::can_do_source_mods())
391 return false;
392
393 return true;
394 }
395
396 void
397 fs_reg::init()
398 {
399 memset(this, 0, sizeof(*this));
400 stride = 1;
401 }
402
403 /** Generic unset register constructor. */
404 fs_reg::fs_reg()
405 {
406 init();
407 this->file = BAD_FILE;
408 }
409
410 /** Immediate value constructor. */
411 fs_reg::fs_reg(float f)
412 {
413 init();
414 this->file = IMM;
415 this->type = BRW_REGISTER_TYPE_F;
416 this->fixed_hw_reg.dw1.f = f;
417 }
418
419 /** Immediate value constructor. */
420 fs_reg::fs_reg(int32_t i)
421 {
422 init();
423 this->file = IMM;
424 this->type = BRW_REGISTER_TYPE_D;
425 this->fixed_hw_reg.dw1.d = i;
426 }
427
428 /** Immediate value constructor. */
429 fs_reg::fs_reg(uint32_t u)
430 {
431 init();
432 this->file = IMM;
433 this->type = BRW_REGISTER_TYPE_UD;
434 this->fixed_hw_reg.dw1.ud = u;
435 }
436
437 /** Fixed brw_reg. */
438 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
439 {
440 init();
441 this->file = HW_REG;
442 this->fixed_hw_reg = fixed_hw_reg;
443 this->type = fixed_hw_reg.type;
444 }
445
446 bool
447 fs_reg::equals(const fs_reg &r) const
448 {
449 return (file == r.file &&
450 reg == r.reg &&
451 reg_offset == r.reg_offset &&
452 subreg_offset == r.subreg_offset &&
453 type == r.type &&
454 negate == r.negate &&
455 abs == r.abs &&
456 !reladdr && !r.reladdr &&
457 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
458 sizeof(fixed_hw_reg)) == 0 &&
459 stride == r.stride);
460 }
461
462 fs_reg &
463 fs_reg::apply_stride(unsigned stride)
464 {
465 assert((this->stride * stride) <= 4 &&
466 (is_power_of_two(stride) || stride == 0) &&
467 file != HW_REG && file != IMM);
468 this->stride *= stride;
469 return *this;
470 }
471
472 fs_reg &
473 fs_reg::set_smear(unsigned subreg)
474 {
475 assert(file != HW_REG && file != IMM);
476 subreg_offset = subreg * type_sz(type);
477 stride = 0;
478 return *this;
479 }
480
481 bool
482 fs_reg::is_contiguous() const
483 {
484 return stride == 1;
485 }
486
487 bool
488 fs_reg::is_valid_3src() const
489 {
490 return file == GRF || file == UNIFORM;
491 }
492
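/* Size of a GLSL type in scalar components, which is the unit used for
 * virtual GRF allocation here.  Opaque types (samplers, atomics) take no
 * register space.
 */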
493 int
494 fs_visitor::type_size(const struct glsl_type *type)
495 {
496 unsigned int size, i;
497
498 switch (type->base_type) {
499 case GLSL_TYPE_UINT:
500 case GLSL_TYPE_INT:
501 case GLSL_TYPE_FLOAT:
502 case GLSL_TYPE_BOOL:
503 return type->components();
504 case GLSL_TYPE_ARRAY:
505 return type_size(type->fields.array) * type->length;
506 case GLSL_TYPE_STRUCT:
507 size = 0;
508 for (i = 0; i < type->length; i++) {
509 size += type_size(type->fields.structure[i].type);
510 }
511 return size;
512 case GLSL_TYPE_SAMPLER:
513 /* Samplers take up no register space, since they're baked in at
514 * link time.
515 */
516 return 0;
517 case GLSL_TYPE_ATOMIC_UINT:
518 return 0;
519 case GLSL_TYPE_IMAGE:
520 case GLSL_TYPE_VOID:
521 case GLSL_TYPE_ERROR:
522 case GLSL_TYPE_INTERFACE:
523 unreachable("not reached");
524 }
525
526 return 0;
527 }
528
529 fs_reg
530 fs_visitor::get_timestamp()
531 {
532 assert(brw->gen >= 7);
533
534 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
535 BRW_ARF_TIMESTAMP,
536 0),
537 BRW_REGISTER_TYPE_UD));
538
539 fs_reg dst = fs_reg(this, glsl_type::uint_type);
540
541 fs_inst *mov = emit(MOV(dst, ts));
542 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
543 * even if it's not enabled in the dispatch.
544 */
545 mov->force_writemask_all = true;
546 mov->force_uncompressed = true;
547
548 /* The caller wants the low 32 bits of the timestamp. Since it's running
549 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
550 * which is plenty of time for our purposes. It is identical across the
551 * EUs, but since it's tracking GPU core speed it will increment at a
552 * varying rate as render P-states change.
553 *
554 * The caller could also check if render P-states have changed (or anything
555 * else that might disrupt timing) by setting smear to 2 and checking if
556 * that field is != 0.
557 */
558 dst.set_smear(0);
559
560 return dst;
561 }
562
563 void
564 fs_visitor::emit_shader_time_begin()
565 {
566 current_annotation = "shader time start";
567 shader_start_time = get_timestamp();
568 }
569
570 void
571 fs_visitor::emit_shader_time_end()
572 {
573 current_annotation = "shader time end";
574
575 enum shader_time_shader_type type, written_type, reset_type;
576 if (dispatch_width == 8) {
577 type = ST_FS8;
578 written_type = ST_FS8_WRITTEN;
579 reset_type = ST_FS8_RESET;
580 } else {
581 assert(dispatch_width == 16);
582 type = ST_FS16;
583 written_type = ST_FS16_WRITTEN;
584 reset_type = ST_FS16_RESET;
585 }
586
587 fs_reg shader_end_time = get_timestamp();
588
589 /* Check that there weren't any timestamp reset events (assuming these
590 * were the only two timestamp reads that happened).
591 */
592 fs_reg reset = shader_end_time;
593 reset.set_smear(2);
594 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
595 test->conditional_mod = BRW_CONDITIONAL_Z;
596 emit(IF(BRW_PREDICATE_NORMAL));
597
598 push_force_uncompressed();
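/* diff = shader_end_time - shader_start_time, written as an ADD with the
 * start operand negated.
 */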
599 fs_reg start = shader_start_time;
600 start.negate = true;
601 fs_reg diff = fs_reg(this, glsl_type::uint_type);
602 emit(ADD(diff, start, shader_end_time));
603
604 /* If there were no instructions between the two timestamp gets, the diff
605 * is 2 cycles. Remove that overhead, so I can forget about that when
606 * trying to determine the time taken for single instructions.
607 */
608 emit(ADD(diff, diff, fs_reg(-2u)));
609
610 emit_shader_time_write(type, diff);
611 emit_shader_time_write(written_type, fs_reg(1u));
612 emit(BRW_OPCODE_ELSE);
613 emit_shader_time_write(reset_type, fs_reg(1u));
614 emit(BRW_OPCODE_ENDIF);
615
616 pop_force_uncompressed();
617 }
618
619 void
620 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
621 fs_reg value)
622 {
623 int shader_time_index =
624 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
625 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
626
627 fs_reg payload;
628 if (dispatch_width == 8)
629 payload = fs_reg(this, glsl_type::uvec2_type);
630 else
631 payload = fs_reg(this, glsl_type::uint_type);
632
633 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
634 fs_reg(), payload, offset, value));
635 }
636
637 void
638 fs_visitor::vfail(const char *format, va_list va)
639 {
640 char *msg;
641
642 if (failed)
643 return;
644
645 failed = true;
646
647 msg = ralloc_vasprintf(mem_ctx, format, va);
648 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
649
650 this->fail_msg = msg;
651
652 if (INTEL_DEBUG & DEBUG_WM) {
653 fprintf(stderr, "%s", msg);
654 }
655 }
656
657 void
658 fs_visitor::fail(const char *format, ...)
659 {
660 va_list va;
661
662 va_start(va, format);
663 vfail(format, va);
664 va_end(va);
665 }
666
667 /**
668 * Mark this program as impossible to compile in SIMD16 mode.
669 *
670 * During the SIMD8 compile (which happens first), we can detect and flag
671 * things that are unsupported in SIMD16 mode, so the compiler can skip
672 * the SIMD16 compile altogether.
673 *
674 * During a SIMD16 compile (if one happens anyway), this just calls fail().
675 */
676 void
677 fs_visitor::no16(const char *format, ...)
678 {
679 va_list va;
680
681 va_start(va, format);
682
683 if (dispatch_width == 16) {
684 vfail(format, va);
685 } else {
686 simd16_unsupported = true;
687
688 if (brw->perf_debug) {
689 if (no16_msg)
690 ralloc_vasprintf_append(&no16_msg, format, va);
691 else
692 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
693 }
694 }
695
696 va_end(va);
697 }
698
699 fs_inst *
700 fs_visitor::emit(enum opcode opcode)
701 {
702 return emit(new(mem_ctx) fs_inst(opcode));
703 }
704
705 fs_inst *
706 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
707 {
708 return emit(new(mem_ctx) fs_inst(opcode, dst));
709 }
710
711 fs_inst *
712 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
713 {
714 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
715 }
716
717 fs_inst *
718 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
719 const fs_reg &src1)
720 {
721 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
722 }
723
724 fs_inst *
725 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
726 const fs_reg &src1, const fs_reg &src2)
727 {
728 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
729 }
730
731 fs_inst *
732 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
733 fs_reg src[], int sources)
734 {
735 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
736 }
737
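/* Bracket a stretch of emitted code that has to run uncompressed (SIMD8
 * execution even in a SIMD16 shader).  Using a counter rather than a bool
 * lets these regions nest.
 */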
738 void
739 fs_visitor::push_force_uncompressed()
740 {
741 force_uncompressed_stack++;
742 }
743
744 void
745 fs_visitor::pop_force_uncompressed()
746 {
747 force_uncompressed_stack--;
748 assert(force_uncompressed_stack >= 0);
749 }
750
751 /**
752 * Returns true if the instruction has a flag that means it won't
753 * update an entire destination register.
754 *
755 * For example, dead code elimination and live variable analysis want to know
756 * when a write to a variable screens off any preceding values that were in
757 * it.
758 */
759 bool
760 fs_inst::is_partial_write() const
761 {
762 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
763 this->force_uncompressed ||
764 this->force_sechalf || !this->dst.is_contiguous());
765 }
766
767 int
768 fs_inst::regs_read(fs_visitor *v, int arg) const
769 {
770 if (is_tex() && arg == 0 && src[0].file == GRF) {
771 if (v->dispatch_width == 16)
772 return (mlen + 1) / 2;
773 else
774 return mlen;
775 }
776 return 1;
777 }
778
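/* Flag-register dependencies: predication implies a read, while a
 * conditional mod (except on SEL, where it only picks a source) or
 * FS_OPCODE_MOV_DISPATCH_TO_FLAGS implies a write.
 */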
779 bool
780 fs_inst::reads_flag() const
781 {
782 return predicate;
783 }
784
785 bool
786 fs_inst::writes_flag() const
787 {
788 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
789 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
790 }
791
792 /**
793 * Returns how many MRFs an FS opcode will write over.
794 *
795 * Note that this is not the 0 or 1 implied writes in an actual gen
796 * instruction -- the FS opcodes often generate MOVs in addition.
797 */
798 int
799 fs_visitor::implied_mrf_writes(fs_inst *inst)
800 {
801 if (inst->mlen == 0)
802 return 0;
803
804 if (inst->base_mrf == -1)
805 return 0;
806
807 switch (inst->opcode) {
808 case SHADER_OPCODE_RCP:
809 case SHADER_OPCODE_RSQ:
810 case SHADER_OPCODE_SQRT:
811 case SHADER_OPCODE_EXP2:
812 case SHADER_OPCODE_LOG2:
813 case SHADER_OPCODE_SIN:
814 case SHADER_OPCODE_COS:
815 return 1 * dispatch_width / 8;
816 case SHADER_OPCODE_POW:
817 case SHADER_OPCODE_INT_QUOTIENT:
818 case SHADER_OPCODE_INT_REMAINDER:
819 return 2 * dispatch_width / 8;
820 case SHADER_OPCODE_TEX:
821 case FS_OPCODE_TXB:
822 case SHADER_OPCODE_TXD:
823 case SHADER_OPCODE_TXF:
824 case SHADER_OPCODE_TXF_CMS:
825 case SHADER_OPCODE_TXF_MCS:
826 case SHADER_OPCODE_TG4:
827 case SHADER_OPCODE_TG4_OFFSET:
828 case SHADER_OPCODE_TXL:
829 case SHADER_OPCODE_TXS:
830 case SHADER_OPCODE_LOD:
831 return 1;
832 case FS_OPCODE_FB_WRITE:
833 return 2;
834 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
835 case SHADER_OPCODE_GEN4_SCRATCH_READ:
836 return 1;
837 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
838 return inst->mlen;
839 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
840 return 2;
841 case SHADER_OPCODE_UNTYPED_ATOMIC:
842 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
843 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
844 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
845 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
846 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
847 return 0;
848 default:
849 unreachable("not reached");
850 }
851 }
852
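/* Allocate a new virtual GRF of the given size (in registers), growing the
 * size-tracking array as needed, and return its index.
 */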
853 int
854 fs_visitor::virtual_grf_alloc(int size)
855 {
856 if (virtual_grf_array_size <= virtual_grf_count) {
857 if (virtual_grf_array_size == 0)
858 virtual_grf_array_size = 16;
859 else
860 virtual_grf_array_size *= 2;
861 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
862 virtual_grf_array_size);
863 }
864 virtual_grf_sizes[virtual_grf_count] = size;
865 return virtual_grf_count++;
866 }
867
868 /** Fixed HW reg constructor. */
869 fs_reg::fs_reg(enum register_file file, int reg)
870 {
871 init();
872 this->file = file;
873 this->reg = reg;
874 this->type = BRW_REGISTER_TYPE_F;
875 }
876
877 /** Fixed HW reg constructor. */
878 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
879 {
880 init();
881 this->file = file;
882 this->reg = reg;
883 this->type = type;
884 }
885
886 /** Automatic reg constructor. */
887 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
888 {
889 init();
890
891 this->file = GRF;
892 this->reg = v->virtual_grf_alloc(v->type_size(type));
893 this->reg_offset = 0;
894 this->type = brw_type_for_base_type(type);
895 }
896
897 fs_reg *
898 fs_visitor::variable_storage(ir_variable *var)
899 {
900 return (fs_reg *)hash_table_find(this->variable_ht, var);
901 }
902
903 void
904 import_uniforms_callback(const void *key,
905 void *data,
906 void *closure)
907 {
908 struct hash_table *dst_ht = (struct hash_table *)closure;
909 const fs_reg *reg = (const fs_reg *)data;
910
911 if (reg->file != UNIFORM)
912 return;
913
914 hash_table_insert(dst_ht, data, key);
915 }
916
917 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
918 * This brings in those uniform definitions.
919 */
920 void
921 fs_visitor::import_uniforms(fs_visitor *v)
922 {
923 hash_table_call_foreach(v->variable_ht,
924 import_uniforms_callback,
925 variable_ht);
926 this->push_constant_loc = v->push_constant_loc;
927 this->pull_constant_loc = v->pull_constant_loc;
928 this->uniforms = v->uniforms;
929 this->param_size = v->param_size;
930 }
931
932 /* Our support for uniforms is piggy-backed on the struct
933 * gl_fragment_program, because that's where the values actually
934 * get stored, rather than in some global gl_shader_program uniform
935 * store.
936 */
937 void
938 fs_visitor::setup_uniform_values(ir_variable *ir)
939 {
940 int namelen = strlen(ir->name);
941
942 /* The data for our (non-builtin) uniforms is stored in a series of
943 * gl_uniform_driver_storage structs for each subcomponent that
944 * glGetUniformLocation() could name. We know it's been set up in the same
945 * order we'd walk the type, so walk the list of storage and find anything
946 * with our name, or the prefix of a component that starts with our name.
947 */
948 unsigned params_before = uniforms;
949 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
950 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
951
952 if (strncmp(ir->name, storage->name, namelen) != 0 ||
953 (storage->name[namelen] != 0 &&
954 storage->name[namelen] != '.' &&
955 storage->name[namelen] != '[')) {
956 continue;
957 }
958
959 unsigned slots = storage->type->component_slots();
960 if (storage->array_elements)
961 slots *= storage->array_elements;
962
963 for (unsigned i = 0; i < slots; i++) {
964 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
965 }
966 }
967
968 /* Make sure we actually initialized the right amount of stuff here. */
969 assert(params_before + ir->type->component_slots() == uniforms);
970 (void)params_before;
971 }
972
973
974 /* Our support for builtin uniforms is even scarier than non-builtin.
975 * It sits on top of the PROG_STATE_VAR parameters that are
976 * automatically updated from GL context state.
977 */
978 void
979 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
980 {
981 const ir_state_slot *const slots = ir->state_slots;
982 assert(ir->state_slots != NULL);
983
984 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
985 /* This state reference has already been setup by ir_to_mesa, but we'll
986 * get the same index back here.
987 */
988 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
989 (gl_state_index *)slots[i].tokens);
990
991 /* Add each of the unique swizzles of the element as a parameter.
992 * This'll end up matching the expected layout of the
993 * array/matrix/structure we're trying to fill in.
994 */
995 int last_swiz = -1;
996 for (unsigned int j = 0; j < 4; j++) {
997 int swiz = GET_SWZ(slots[i].swizzle, j);
998 if (swiz == last_swiz)
999 break;
1000 last_swiz = swiz;
1001
1002 stage_prog_data->param[uniforms++] =
1003 &fp->Base.Parameters->ParameterValues[index][swiz].f;
1004 }
1005 }
1006 }
1007
1008 fs_reg *
1009 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1010 {
1011 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1012 fs_reg wpos = *reg;
1013 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1014
1015 /* gl_FragCoord.x */
1016 if (ir->data.pixel_center_integer) {
1017 emit(MOV(wpos, this->pixel_x));
1018 } else {
1019 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1020 }
1021 wpos.reg_offset++;
1022
1023 /* gl_FragCoord.y */
1024 if (!flip && ir->data.pixel_center_integer) {
1025 emit(MOV(wpos, this->pixel_y));
1026 } else {
1027 fs_reg pixel_y = this->pixel_y;
1028 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1029
1030 if (flip) {
1031 pixel_y.negate = true;
1032 offset += key->drawable_height - 1.0;
1033 }
1034
1035 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1036 }
1037 wpos.reg_offset++;
1038
1039 /* gl_FragCoord.z */
1040 if (brw->gen >= 6) {
1041 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1042 } else {
1043 emit(FS_OPCODE_LINTERP, wpos,
1044 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1045 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1046 interp_reg(VARYING_SLOT_POS, 2));
1047 }
1048 wpos.reg_offset++;
1049
1050 /* gl_FragCoord.w: Already set up in emit_interpolation */
1051 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1052
1053 return reg;
1054 }
1055
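/* Emit a LINTERP for one attribute channel, picking the barycentric
 * coordinate set that matches the interpolation qualifier and the
 * centroid/sample flags on gen6+.  Earlier hardware only has the
 * perspective pixel barycentrics.
 */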
1056 fs_inst *
1057 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1058 glsl_interp_qualifier interpolation_mode,
1059 bool is_centroid, bool is_sample)
1060 {
1061 brw_wm_barycentric_interp_mode barycoord_mode;
1062 if (brw->gen >= 6) {
1063 if (is_centroid) {
1064 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1065 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1066 else
1067 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1068 } else if (is_sample) {
1069 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1070 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1071 else
1072 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1073 } else {
1074 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1075 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1076 else
1077 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1078 }
1079 } else {
1080 /* On Ironlake and below, there is only one interpolation mode.
1081 * Centroid interpolation doesn't mean anything on this hardware --
1082 * there is no multisampling.
1083 */
1084 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1085 }
1086 return emit(FS_OPCODE_LINTERP, attr,
1087 this->delta_x[barycoord_mode],
1088 this->delta_y[barycoord_mode], interp);
1089 }
1090
1091 fs_reg *
1092 fs_visitor::emit_general_interpolation(ir_variable *ir)
1093 {
1094 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1095 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1096 fs_reg attr = *reg;
1097
1098 unsigned int array_elements;
1099 const glsl_type *type;
1100
1101 if (ir->type->is_array()) {
1102 array_elements = ir->type->length;
1103 if (array_elements == 0) {
1104 fail("dereferenced array '%s' has length 0\n", ir->name);
1105 }
1106 type = ir->type->fields.array;
1107 } else {
1108 array_elements = 1;
1109 type = ir->type;
1110 }
1111
1112 glsl_interp_qualifier interpolation_mode =
1113 ir->determine_interpolation_mode(key->flat_shade);
1114
1115 int location = ir->data.location;
1116 for (unsigned int i = 0; i < array_elements; i++) {
1117 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1118 if (prog_data->urb_setup[location] == -1) {
1119 /* If there's no incoming setup data for this slot, don't
1120 * emit interpolation for it.
1121 */
1122 attr.reg_offset += type->vector_elements;
1123 location++;
1124 continue;
1125 }
1126
1127 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1128 /* Constant interpolation (flat shading) case. The SF has
1129 * handed us defined values in only the constant offset
1130 * field of the setup reg.
1131 */
1132 for (unsigned int k = 0; k < type->vector_elements; k++) {
1133 struct brw_reg interp = interp_reg(location, k);
1134 interp = suboffset(interp, 3);
1135 interp.type = reg->type;
1136 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1137 attr.reg_offset++;
1138 }
1139 } else {
1140 /* Smooth/noperspective interpolation case. */
1141 for (unsigned int k = 0; k < type->vector_elements; k++) {
1142 struct brw_reg interp = interp_reg(location, k);
1143 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1144 /* Get the pixel/sample mask into f0 so that we know
1145 * which pixels are lit. Then, for each channel that is
1146 * unlit, replace the centroid data with non-centroid
1147 * data.
1148 */
1149 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1150
1151 fs_inst *inst;
1152 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1153 false, false);
1154 inst->predicate = BRW_PREDICATE_NORMAL;
1155 inst->predicate_inverse = true;
1156 if (brw->has_pln)
1157 inst->no_dd_clear = true;
1158
1159 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1160 ir->data.centroid && !key->persample_shading,
1161 ir->data.sample || key->persample_shading);
1162 inst->predicate = BRW_PREDICATE_NORMAL;
1163 inst->predicate_inverse = false;
1164 if (brw->has_pln)
1165 inst->no_dd_check = true;
1166
1167 } else {
1168 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1169 ir->data.centroid && !key->persample_shading,
1170 ir->data.sample || key->persample_shading);
1171 }
1172 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1173 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1174 }
1175 attr.reg_offset++;
1176 }
1177
1178 }
1179 location++;
1180 }
1181 }
1182
1183 return reg;
1184 }
1185
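/* gl_FrontFacing setup.  On gen6+ the bit comes from the thread payload
 * (g0.0 bit 15, extracted with the ASR/NOT/AND below); on older hardware it
 * is derived from the "primitive is back face" bit in g1.6 with a CMP.
 */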
1186 fs_reg *
1187 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1188 {
1189 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1190
1191 /* The frontfacing comes in as a bit in the thread payload. */
1192 if (brw->gen >= 6) {
1193 emit(BRW_OPCODE_ASR, *reg,
1194 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1195 fs_reg(15));
1196 emit(BRW_OPCODE_NOT, *reg, *reg);
1197 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1198 } else {
1199 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1200 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1201 * us front face
1202 */
1203 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1204 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1205 }
1206
1207 return reg;
1208 }
1209
1210 void
1211 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1212 {
1213 assert(dst.type == BRW_REGISTER_TYPE_F);
1214
1215 if (key->compute_pos_offset) {
1216 /* Convert int_sample_pos to floating point */
1217 emit(MOV(dst, int_sample_pos));
1218 /* Scale to the range [0, 1] */
1219 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1220 }
1221 else {
1222 /* From ARB_sample_shading specification:
1223 * "When rendering to a non-multisample buffer, or if multisample
1224 * rasterization is disabled, gl_SamplePosition will always be
1225 * (0.5, 0.5).
1226 */
1227 emit(MOV(dst, fs_reg(0.5f)));
1228 }
1229 }
1230
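/* gl_SamplePosition setup: read the per-slot X/Y sample offsets from the
 * thread payload as bytes and convert them to floats in [0, 1] (or use the
 * constant (0.5, 0.5) when no per-sample position offset is needed).
 */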
1231 fs_reg *
1232 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1233 {
1234 assert(brw->gen >= 6);
1235 assert(ir->type == glsl_type::vec2_type);
1236
1237 this->current_annotation = "compute sample position";
1238 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1239 fs_reg pos = *reg;
1240 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1241 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1242
1243 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1244 * mode will be enabled.
1245 *
1246 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1247 * R31.1:0 Position Offset X/Y for Slot[3:0]
1248 * R31.3:2 Position Offset X/Y for Slot[7:4]
1249 * .....
1250 *
1251 * The X, Y sample positions come in as bytes in thread payload. So, read
1252 * the positions using vstride=16, width=8, hstride=2.
1253 */
1254 struct brw_reg sample_pos_reg =
1255 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1256 BRW_REGISTER_TYPE_B), 16, 8, 2);
1257
1258 fs_inst *inst = emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1259 if (dispatch_width == 16) {
1260 inst->force_uncompressed = true;
1261 inst = emit(MOV(half(int_sample_x, 1),
1262 fs_reg(suboffset(sample_pos_reg, 16))));
1263 inst->force_sechalf = true;
1264 }
1265 /* Compute gl_SamplePosition.x */
1266 compute_sample_position(pos, int_sample_x);
1267 pos.reg_offset++;
1268 inst = emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1269 if (dispatch_width == 16) {
1270 inst->force_uncompressed = true;
1271 inst = emit(MOV(half(int_sample_y, 1),
1272 fs_reg(suboffset(sample_pos_reg, 17))));
1273 inst->force_sechalf = true;
1274 }
1275 /* Compute gl_SamplePosition.y */
1276 compute_sample_position(pos, int_sample_y);
1277 return reg;
1278 }
1279
1280 fs_reg *
1281 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1282 {
1283 assert(brw->gen >= 6);
1284
1285 this->current_annotation = "compute sample id";
1286 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1287
1288 if (key->compute_sample_id) {
1289 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1290 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1291 t2.type = BRW_REGISTER_TYPE_UW;
1292
1293 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1294 * 8x multisampling, subspan 0 will represent sample N (where N
1295 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1296 * 7. We can find the value of N by looking at R0.0 bits 7:6
1297 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1298 * (since samples are always delivered in pairs). That is, we
1299 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1300 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1301 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1302 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1303 * populating a temporary variable with the sequence (0, 1, 2, 3),
1304 * and then reading from it using vstride=1, width=4, hstride=0.
1305 * These computations hold good for 4x multisampling as well.
1306 *
1307 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1308 * the first four slots are sample 0 of subspan 0; the next four
1309 * are sample 1 of subspan 0; the third group is sample 0 of
1310 * subspan 1, and finally sample 1 of subspan 1.
1311 */
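/* Worked example of the computation above (hypothetical payload value):
 * with 8x MSAA and R0.0 bits 7:6 = 0b10, SSPI = 2, so (R0.0 & 0xc0) >> 5
 * = 4; adding the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1) gives sample IDs
 * (4, 4, 4, 4, 5, 5, 5, 5) for the two subspans.
 */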
1312 fs_inst *inst;
1313 inst = emit(BRW_OPCODE_AND, t1,
1314 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1315 fs_reg(0xc0));
1316 inst->force_writemask_all = true;
1317 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1318 inst->force_writemask_all = true;
1319 /* This works for both SIMD8 and SIMD16 */
1320 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1321 inst->force_writemask_all = true;
1322 /* This special instruction takes care of setting vstride=1,
1323 * width=4, hstride=0 of t2 during an ADD instruction.
1324 */
1325 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1326 } else {
1327 /* As per GL_ARB_sample_shading specification:
1328 * "When rendering to a non-multisample buffer, or if multisample
1329 * rasterization is disabled, gl_SampleID will always be zero."
1330 */
1331 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1332 }
1333
1334 return reg;
1335 }
1336
1337 fs_reg
1338 fs_visitor::fix_math_operand(fs_reg src)
1339 {
1340 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1341 * might be able to do better by doing execsize = 1 math and then
1342 * expanding that result out, but we would need to be careful with
1343 * masking.
1344 *
1345 * The hardware ignores source modifiers (negate and abs) on math
1346 * instructions, so we also move to a temp to set those up.
1347 */
1348 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1349 !src.abs && !src.negate)
1350 return src;
1351
1352 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1353 * operands to math
1354 */
1355 if (brw->gen >= 7 && src.file != IMM)
1356 return src;
1357
1358 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1359 expanded.type = src.type;
1360 emit(BRW_OPCODE_MOV, expanded, src);
1361 return expanded;
1362 }
1363
1364 fs_inst *
1365 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1366 {
1367 switch (opcode) {
1368 case SHADER_OPCODE_RCP:
1369 case SHADER_OPCODE_RSQ:
1370 case SHADER_OPCODE_SQRT:
1371 case SHADER_OPCODE_EXP2:
1372 case SHADER_OPCODE_LOG2:
1373 case SHADER_OPCODE_SIN:
1374 case SHADER_OPCODE_COS:
1375 break;
1376 default:
1377 unreachable("not reached: bad math opcode");
1378 }
1379
1380 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1381 * might be able to do better by doing execsize = 1 math and then
1382 * expanding that result out, but we would need to be careful with
1383 * masking.
1384 *
1385 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1386 * instructions, so we also move to a temp to set those up.
1387 */
1388 if (brw->gen == 6 || brw->gen == 7)
1389 src = fix_math_operand(src);
1390
1391 fs_inst *inst = emit(opcode, dst, src);
1392
1393 if (brw->gen < 6) {
1394 inst->base_mrf = 2;
1395 inst->mlen = dispatch_width / 8;
1396 }
1397
1398 return inst;
1399 }
1400
1401 fs_inst *
1402 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1403 {
1404 int base_mrf = 2;
1405 fs_inst *inst;
1406
1407 switch (opcode) {
1408 case SHADER_OPCODE_INT_QUOTIENT:
1409 case SHADER_OPCODE_INT_REMAINDER:
1410 if (brw->gen >= 7)
1411 no16("SIMD16 INTDIV unsupported\n");
1412 break;
1413 case SHADER_OPCODE_POW:
1414 break;
1415 default:
1416 unreachable("not reached: unsupported binary math opcode.");
1417 }
1418
1419 if (brw->gen >= 8) {
1420 inst = emit(opcode, dst, src0, src1);
1421 } else if (brw->gen >= 6) {
1422 src0 = fix_math_operand(src0);
1423 src1 = fix_math_operand(src1);
1424
1425 inst = emit(opcode, dst, src0, src1);
1426 } else {
1427 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1428 * "Message Payload":
1429 *
1430 * "Operand0[7]. For the INT DIV functions, this operand is the
1431 * denominator."
1432 * ...
1433 * "Operand1[7]. For the INT DIV functions, this operand is the
1434 * numerator."
1435 */
1436 bool is_int_div = opcode != SHADER_OPCODE_POW;
1437 fs_reg &op0 = is_int_div ? src1 : src0;
1438 fs_reg &op1 = is_int_div ? src0 : src1;
1439
1440 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1441 inst = emit(opcode, dst, op0, reg_null_f);
1442
1443 inst->base_mrf = base_mrf;
1444 inst->mlen = 2 * dispatch_width / 8;
1445 }
1446 return inst;
1447 }
1448
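/* Push-constant (CURB) setup: record where the push constants start for
 * this dispatch width, size the constant read, and rewrite UNIFORM-file
 * sources into fixed GRF accesses.
 */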
1449 void
1450 fs_visitor::assign_curb_setup()
1451 {
1452 if (dispatch_width == 8) {
1453 prog_data->base.dispatch_grf_start_reg = payload.num_regs;
1454 } else {
1455 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1456 }
1457
1458 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1459
1460 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1461 foreach_in_list(fs_inst, inst, &instructions) {
1462 for (unsigned int i = 0; i < inst->sources; i++) {
1463 if (inst->src[i].file == UNIFORM) {
1464 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1465 int constant_nr;
1466 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1467 constant_nr = push_constant_loc[uniform_nr];
1468 } else {
1469 /* Section 5.11 of the OpenGL 4.1 spec says:
1470 * "Out-of-bounds reads return undefined values, which include
1471 * values from other variables of the active program or zero."
1472 * Just return the first push constant.
1473 */
1474 constant_nr = 0;
1475 }
1476
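/* Push constants are packed eight dwords to a GRF, so e.g. constant_nr 11
 * lands in GRF (payload.num_regs + 1), channel 3.
 */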
1477 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1478 constant_nr / 8,
1479 constant_nr % 8);
1480
1481 inst->src[i].file = HW_REG;
1482 inst->src[i].fixed_hw_reg = byte_offset(
1483 retype(brw_reg, inst->src[i].type),
1484 inst->src[i].subreg_offset);
1485 }
1486 }
1487 }
1488 }
1489
1490 void
1491 fs_visitor::calculate_urb_setup()
1492 {
1493 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1494 prog_data->urb_setup[i] = -1;
1495 }
1496
1497 int urb_next = 0;
1498 /* Figure out where each of the incoming setup attributes lands. */
1499 if (brw->gen >= 6) {
1500 if (_mesa_bitcount_64(fp->Base.InputsRead &
1501 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1502 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1503 * first 16 varying inputs, so we can put them wherever we want.
1504 * Just put them in order.
1505 *
1506 * This is useful because it means that (a) inputs not used by the
1507 * fragment shader won't take up valuable register space, and (b) we
1508 * won't have to recompile the fragment shader if it gets paired with
1509 * a different vertex (or geometry) shader.
1510 */
1511 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1512 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1513 BITFIELD64_BIT(i)) {
1514 prog_data->urb_setup[i] = urb_next++;
1515 }
1516 }
1517 } else {
1518 /* We have enough input varyings that the SF/SBE pipeline stage can't
1519 * arbitrarily rearrange them to suit our whim; we have to put them
1520 * in an order that matches the output of the previous pipeline stage
1521 * (geometry or vertex shader).
1522 */
1523 struct brw_vue_map prev_stage_vue_map;
1524 brw_compute_vue_map(brw, &prev_stage_vue_map,
1525 key->input_slots_valid);
1526 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1527 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1528 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1529 slot++) {
1530 int varying = prev_stage_vue_map.slot_to_varying[slot];
1531 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1532 * unused.
1533 */
1534 if (varying != BRW_VARYING_SLOT_COUNT &&
1535 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1536 BITFIELD64_BIT(varying))) {
1537 prog_data->urb_setup[varying] = slot - first_slot;
1538 }
1539 }
1540 urb_next = prev_stage_vue_map.num_slots - first_slot;
1541 }
1542 } else {
1543 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1544 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1545 /* Point size is packed into the header, not as a general attribute */
1546 if (i == VARYING_SLOT_PSIZ)
1547 continue;
1548
1549 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1550 /* The back color slot is skipped when the front color is
1551 * also written to. In addition, some slots can be
1552 * written in the vertex shader and not read in the
1553 * fragment shader. So the register number must always be
1554 * incremented, mapped or not.
1555 */
1556 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1557 prog_data->urb_setup[i] = urb_next;
1558 urb_next++;
1559 }
1560 }
1561
1562 /*
1563 * It's an FS-only attribute, and we did interpolation for this attribute
1564 * in the SF thread.  So, count it here, too.
1565 *
1566 * See compile_sf_prog() for more info.
1567 */
1568 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1569 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1570 }
1571
1572 prog_data->num_varying_inputs = urb_next;
1573 }
1574
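/* Now that the CURB size is known, turn the slot indices recorded in
 * urb_setup[] into actual setup-register numbers for the interpolation
 * instructions.
 */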
1575 void
1576 fs_visitor::assign_urb_setup()
1577 {
1578 int urb_start = payload.num_regs + prog_data->curb_read_length;
1579
1580 /* Offset all the urb_setup[] index by the actual position of the
1581 * setup regs, now that the location of the constants has been chosen.
1582 */
1583 foreach_in_list(fs_inst, inst, &instructions) {
1584 if (inst->opcode == FS_OPCODE_LINTERP) {
1585 assert(inst->src[2].file == HW_REG);
1586 inst->src[2].fixed_hw_reg.nr += urb_start;
1587 }
1588
1589 if (inst->opcode == FS_OPCODE_CINTERP) {
1590 assert(inst->src[0].file == HW_REG);
1591 inst->src[0].fixed_hw_reg.nr += urb_start;
1592 }
1593 }
1594
1595 /* Each attribute is 4 setup channels, each of which is half a reg. */
1596 this->first_non_payload_grf =
1597 urb_start + prog_data->num_varying_inputs * 2;
1598 }
1599
1600 /**
1601 * Split large virtual GRFs into separate components if we can.
1602 *
1603 * This is mostly duplicated with what brw_fs_vector_splitting does,
1604 * but that's really conservative because it's afraid of doing
1605 * splitting that doesn't result in real progress after the rest of
1606 * the optimization phases, which would cause infinite looping in
1607 * optimization. We can do it once here, safely. This also has the
1608 * opportunity to split interpolated values, or maybe even uniforms,
1609 * which we don't have at the IR level.
1610 *
1611 * We want to split, because virtual GRFs are what we register
1612 * allocate and spill (due to contiguousness requirements for some
1613 * instructions), and they're what we naturally generate in the
1614 * codegen process, but most virtual GRFs don't actually need to be
1615 * contiguous sets of GRFs. If we split, we'll end up with reduced
1616 * live intervals and better dead code elimination and coalescing.
1617 */
1618 void
1619 fs_visitor::split_virtual_grfs()
1620 {
1621 int num_vars = this->virtual_grf_count;
1622 bool split_grf[num_vars];
1623 int new_virtual_grf[num_vars];
1624
1625 /* Try to split anything > 0 sized. */
1626 for (int i = 0; i < num_vars; i++) {
1627 if (this->virtual_grf_sizes[i] != 1)
1628 split_grf[i] = true;
1629 else
1630 split_grf[i] = false;
1631 }
1632
1633 if (brw->has_pln &&
1634 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1635 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1636 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1637 * Gen6, that was the only supported interpolation mode, and since Gen6,
1638 * delta_x and delta_y are in fixed hardware registers.
1639 */
1640 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1641 false;
1642 }
1643
1644 foreach_in_list(fs_inst, inst, &instructions) {
1645 /* If there's a SEND message that requires contiguous destination
1646 * registers, no splitting is allowed.
1647 */
1648 if (inst->regs_written > 1) {
1649 split_grf[inst->dst.reg] = false;
1650 }
1651
1652 /* If we're sending from a GRF, don't split it, on the assumption that
1653 * the send is reading the whole thing.
1654 */
1655 if (inst->is_send_from_grf()) {
1656 for (int i = 0; i < inst->sources; i++) {
1657 if (inst->src[i].file == GRF) {
1658 split_grf[inst->src[i].reg] = false;
1659 }
1660 }
1661 }
1662 }
1663
1664 /* Allocate new space for split regs. Note that the virtual
1665 * numbers will be contiguous.
1666 */
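/* After splitting, a virtual GRF of size N keeps reg_offset 0 in the
 * original (now size-1) register and maps reg_offset k >= 1 to
 * new_virtual_grf + k - 1, as done in the rewrite loop below.
 */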
1667 for (int i = 0; i < num_vars; i++) {
1668 if (split_grf[i]) {
1669 new_virtual_grf[i] = virtual_grf_alloc(1);
1670 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1671 int reg = virtual_grf_alloc(1);
1672 assert(reg == new_virtual_grf[i] + j - 1);
1673 (void) reg;
1674 }
1675 this->virtual_grf_sizes[i] = 1;
1676 }
1677 }
1678
1679 foreach_in_list(fs_inst, inst, &instructions) {
1680 if (inst->dst.file == GRF &&
1681 split_grf[inst->dst.reg] &&
1682 inst->dst.reg_offset != 0) {
1683 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1684 inst->dst.reg_offset - 1);
1685 inst->dst.reg_offset = 0;
1686 }
1687 for (int i = 0; i < inst->sources; i++) {
1688 if (inst->src[i].file == GRF &&
1689 split_grf[inst->src[i].reg] &&
1690 inst->src[i].reg_offset != 0) {
1691 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1692 inst->src[i].reg_offset - 1);
1693 inst->src[i].reg_offset = 0;
1694 }
1695 }
1696 }
1697 invalidate_live_intervals();
1698 }
1699
1700 /**
1701 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1702 *
1703 * During code generation, we create tons of temporary variables, many of
1704 * which get immediately killed and are never used again. Yet, in later
1705 * optimization and analysis passes, such as compute_live_intervals, we need
1706 * to loop over all the virtual GRFs. Compacting them can save a lot of
1707 * overhead.
1708 */
1709 void
1710 fs_visitor::compact_virtual_grfs()
1711 {
1712 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1713 return;
1714
1715 /* Mark which virtual GRFs are used, and count how many. */
1716 int remap_table[this->virtual_grf_count];
1717 memset(remap_table, -1, sizeof(remap_table));
1718
1719 foreach_in_list(const fs_inst, inst, &instructions) {
1720 if (inst->dst.file == GRF)
1721 remap_table[inst->dst.reg] = 0;
1722
1723 for (int i = 0; i < inst->sources; i++) {
1724 if (inst->src[i].file == GRF)
1725 remap_table[inst->src[i].reg] = 0;
1726 }
1727 }
1728
1729 /* Compact the GRF arrays. */
1730 int new_index = 0;
1731 for (int i = 0; i < this->virtual_grf_count; i++) {
1732 if (remap_table[i] != -1) {
1733 remap_table[i] = new_index;
1734 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1735 invalidate_live_intervals();
1736 ++new_index;
1737 }
1738 }
1739
1740 this->virtual_grf_count = new_index;
1741
1742 /* Patch all the instructions to use the newly renumbered registers */
1743 foreach_in_list(fs_inst, inst, &instructions) {
1744 if (inst->dst.file == GRF)
1745 inst->dst.reg = remap_table[inst->dst.reg];
1746
1747 for (int i = 0; i < inst->sources; i++) {
1748 if (inst->src[i].file == GRF)
1749 inst->src[i].reg = remap_table[inst->src[i].reg];
1750 }
1751 }
1752
1753 /* Patch all the references to delta_x/delta_y, since they're used in
1754 * register allocation.
1755 */
1756 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
1757 if (delta_x[i].file == GRF && remap_table[delta_x[i].reg] != -1) {
1758 delta_x[i].reg = remap_table[delta_x[i].reg];
1759 }
1760 }
1761 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
1762 if (delta_y[i].file == GRF && remap_table[delta_y[i].reg] != -1) {
1763 delta_y[i].reg = remap_table[delta_y[i].reg];
1764 }
1765 }
1766 }
1767
1768 /*
1769 * Implements array access of uniforms by inserting a
1770 * PULL_CONSTANT_LOAD instruction.
1771 *
1772 * Unlike temporary GRF array access (where we don't support it due to
1773 * the difficulty of doing relative addressing on instruction
1774 * destinations), we could potentially do array access of uniforms
1775 * that were loaded in GRF space as push constants. In real-world
1776 * usage we've seen, though, the arrays being used are always larger
1777 * than we could load as push constants, so just always move all
1778 * uniform array access out to a pull constant buffer.
1779 */
1780 void
1781 fs_visitor::move_uniform_array_access_to_pull_constants()
1782 {
1783 if (dispatch_width != 8)
1784 return;
1785
1786 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1787
1788 for (unsigned int i = 0; i < uniforms; i++) {
1789 pull_constant_loc[i] = -1;
1790 }
1791
1792 /* Walk through and find array access of uniforms. Put a copy of that
1793 * uniform in the pull constant buffer.
1794 *
1795 * Note that we don't move constant-indexed accesses to arrays. No
1796 * testing has been done of the performance impact of this choice.
1797 */
1798 foreach_in_list_safe(fs_inst, inst, &instructions) {
1799 for (int i = 0 ; i < inst->sources; i++) {
1800 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1801 continue;
1802
1803 int uniform = inst->src[i].reg;
1804
1805 /* If this array isn't already present in the pull constant buffer,
1806 * add it.
1807 */
1808 if (pull_constant_loc[uniform] == -1) {
1809 const float **values = &stage_prog_data->param[uniform];
1810
1811 assert(param_size[uniform]);
1812
1813 for (int j = 0; j < param_size[uniform]; j++) {
1814 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1815
1816 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1817 values[j];
1818 }
1819 }
1820 }
1821 }
1822 }
1823
1824 /**
1825 * Assign UNIFORM file registers to either push constants or pull constants.
1826 *
1827 * We allow a fragment shader to have more than the specified minimum
1828 * maximum number of fragment shader uniform components (64). If
1829 * there are too many of these, they'd fill up all of register space.
1830 * So, this will push some of them out to the pull constant buffer and
1831 * update the program to load them.
1832 */
1833 void
1834 fs_visitor::assign_constant_locations()
1835 {
1836 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1837 if (dispatch_width != 8)
1838 return;
1839
1840 /* Find which UNIFORM registers are still in use. */
1841 bool is_live[uniforms];
1842 for (unsigned int i = 0; i < uniforms; i++) {
1843 is_live[i] = false;
1844 }
1845
1846 foreach_in_list(fs_inst, inst, &instructions) {
1847 for (int i = 0; i < inst->sources; i++) {
1848 if (inst->src[i].file != UNIFORM)
1849 continue;
1850
1851 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1852 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1853 is_live[constant_nr] = true;
1854 }
1855 }
1856
1857 /* Only allow 16 registers (128 uniform components) as push constants.
1858 *
1859 * Just demote the end of the list. We could probably do better
1860 * here, demoting things that are rarely used in the program first.
1861 *
1862 * If changing this value, note the limitation about total_regs in
1863 * brw_curbe.c.
1864 */
1865 unsigned int max_push_components = 16 * 8;
1866 unsigned int num_push_constants = 0;
1867
1868 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1869
1870 for (unsigned int i = 0; i < uniforms; i++) {
1871 if (!is_live[i] || pull_constant_loc[i] != -1) {
1872 /* This UNIFORM register is either dead, or has already been demoted
1873 * to a pull const. Mark it as no longer living in the param[] array.
1874 */
1875 push_constant_loc[i] = -1;
1876 continue;
1877 }
1878
1879 if (num_push_constants < max_push_components) {
1880 /* Retain as a push constant. Record the location in the params[]
1881 * array.
1882 */
1883 push_constant_loc[i] = num_push_constants++;
1884 } else {
1885 /* Demote to a pull constant. */
1886 push_constant_loc[i] = -1;
1887
1888 int pull_index = stage_prog_data->nr_pull_params++;
1889 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1890 pull_constant_loc[i] = pull_index;
1891 }
1892 }
1893
1894 stage_prog_data->nr_params = num_push_constants;
1895
1896 /* Up until now, the param[] array has been indexed by reg + reg_offset
1897 * of UNIFORM registers. Condense it to only contain the uniforms we
1898 * chose to upload as push constants.
1899 */
1900 for (unsigned int i = 0; i < uniforms; i++) {
1901 int remapped = push_constant_loc[i];
1902
1903 if (remapped == -1)
1904 continue;
1905
1906 assert(remapped <= (int)i);
1907 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1908 }
1909 }
1910
1911 /**
1912 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1913 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1914 */
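/* Sketch of the constant-index case below, with made-up numbers: a source
 * reading a uniform whose pull_constant_loc is 5 gets a
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD tmp, surf_index, 16
 *
 * inserted before it (byte offset 5 * 4 = 20, rounded down to the 16-byte
 * vec4 boundary), and the source is rewritten to read tmp with
 * set_smear(5 & 3), i.e. component 1 of that vec4.  Sources with a reladdr
 * take the VARYING_PULL_CONSTANT_LOAD path instead.
 */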
1915 void
1916 fs_visitor::demote_pull_constants()
1917 {
1918 foreach_in_list(fs_inst, inst, &instructions) {
1919 for (int i = 0; i < inst->sources; i++) {
1920 if (inst->src[i].file != UNIFORM)
1921 continue;
1922
1923 int pull_index = pull_constant_loc[inst->src[i].reg +
1924 inst->src[i].reg_offset];
1925 if (pull_index == -1)
1926 continue;
1927
1928 /* Set up the annotation tracking for new generated instructions. */
1929 base_ir = inst->ir;
1930 current_annotation = inst->annotation;
1931
1932 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1933 fs_reg dst = fs_reg(this, glsl_type::float_type);
1934
1935 /* Generate a pull load into dst. */
1936 if (inst->src[i].reladdr) {
1937 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1938 surf_index,
1939 *inst->src[i].reladdr,
1940 pull_index);
1941 inst->insert_before(&list);
1942 inst->src[i].reladdr = NULL;
1943 } else {
1944 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1945 fs_inst *pull =
1946 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1947 dst, surf_index, offset);
1948 inst->insert_before(pull);
1949 inst->src[i].set_smear(pull_index & 3);
1950 }
1951
1952 /* Rewrite the instruction to use the temporary VGRF. */
1953 inst->src[i].file = GRF;
1954 inst->src[i].reg = dst.reg;
1955 inst->src[i].reg_offset = 0;
1956 }
1957 }
1958 invalidate_live_intervals();
1959 }
1960
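/**
 * Performs simple algebraic simplifications on the IR; for instance
 * (illustrative IR), "mul vgrf3, vgrf2, 1.0f" becomes "mov vgrf3, vgrf2",
 * and a SEL whose two sources are identical becomes an unpredicated MOV.
 */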
1961 bool
1962 fs_visitor::opt_algebraic()
1963 {
1964 bool progress = false;
1965
1966 foreach_in_list(fs_inst, inst, &instructions) {
1967 switch (inst->opcode) {
1968 case BRW_OPCODE_MUL:
1969 if (inst->src[1].file != IMM)
1970 continue;
1971
1972 /* a * 1.0 = a */
1973 if (inst->src[1].is_one()) {
1974 inst->opcode = BRW_OPCODE_MOV;
1975 inst->src[1] = reg_undef;
1976 progress = true;
1977 break;
1978 }
1979
1980 /* a * 0.0 = 0.0 */
1981 if (inst->src[1].is_zero()) {
1982 inst->opcode = BRW_OPCODE_MOV;
1983 inst->src[0] = inst->src[1];
1984 inst->src[1] = reg_undef;
1985 progress = true;
1986 break;
1987 }
1988
1989 break;
1990 case BRW_OPCODE_ADD:
1991 if (inst->src[1].file != IMM)
1992 continue;
1993
1994 /* a + 0.0 = a */
1995 if (inst->src[1].is_zero()) {
1996 inst->opcode = BRW_OPCODE_MOV;
1997 inst->src[1] = reg_undef;
1998 progress = true;
1999 break;
2000 }
2001 break;
2002 case BRW_OPCODE_OR:
2003 if (inst->src[0].equals(inst->src[1])) {
2004 inst->opcode = BRW_OPCODE_MOV;
2005 inst->src[1] = reg_undef;
2006 progress = true;
2007 break;
2008 }
2009 break;
2010 case BRW_OPCODE_LRP:
2011 if (inst->src[1].equals(inst->src[2])) {
2012 inst->opcode = BRW_OPCODE_MOV;
2013 inst->src[0] = inst->src[1];
2014 inst->src[1] = reg_undef;
2015 inst->src[2] = reg_undef;
2016 progress = true;
2017 break;
2018 }
2019 break;
2020 case BRW_OPCODE_SEL:
2021 if (inst->src[0].equals(inst->src[1])) {
2022 inst->opcode = BRW_OPCODE_MOV;
2023 inst->src[1] = reg_undef;
2024 inst->predicate = BRW_PREDICATE_NONE;
2025 inst->predicate_inverse = false;
2026 progress = true;
2027 } else if (inst->saturate && inst->src[1].file == IMM) {
2028 switch (inst->conditional_mod) {
2029 case BRW_CONDITIONAL_LE:
2030 case BRW_CONDITIONAL_L:
2031 switch (inst->src[1].type) {
2032 case BRW_REGISTER_TYPE_F:
2033 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2034 inst->opcode = BRW_OPCODE_MOV;
2035 inst->src[1] = reg_undef;
/* Don't let the new MOV inherit the SEL's conditional mod,
 * or it would start writing the flag register.
 */
inst->conditional_mod = BRW_CONDITIONAL_NONE;
2036 progress = true;
2037 }
2038 break;
2039 default:
2040 break;
2041 }
2042 break;
2043 case BRW_CONDITIONAL_GE:
2044 case BRW_CONDITIONAL_G:
2045 switch (inst->src[1].type) {
2046 case BRW_REGISTER_TYPE_F:
2047 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2048 inst->opcode = BRW_OPCODE_MOV;
2049 inst->src[1] = reg_undef;
2050 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2051 progress = true;
2052 }
2053 break;
2054 default:
2055 break;
2056 }
break;
2057 default:
2058 break;
2059 }
2060 }
2061 break;
2062 default:
2063 break;
2064 }
2065 }
2066
2067 return progress;
2068 }
2069
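/**
 * Rewrites the instruction that produced a GRF value so that it writes
 * directly into the MRF, eliminating the intervening MOV.
 *
 * A sketch with invented register numbers: given
 *
 *    mul vgrf4, vgrf2, vgrf3
 *    mov m5, vgrf4
 *
 * where vgrf4 is not read afterwards, the pass turns the pair into
 *
 *    mul m5, vgrf2, vgrf3
 *
 * subject to the restrictions checked below (no partial writes, single
 * register results, gen6 math must target a GRF, and so on).
 */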
2070 bool
2071 fs_visitor::compute_to_mrf()
2072 {
2073 bool progress = false;
2074 int next_ip = 0;
2075
2076 calculate_live_intervals();
2077
2078 foreach_in_list_safe(fs_inst, inst, &instructions) {
2079 int ip = next_ip;
2080 next_ip++;
2081
2082 if (inst->opcode != BRW_OPCODE_MOV ||
2083 inst->is_partial_write() ||
2084 inst->dst.file != MRF || inst->src[0].file != GRF ||
2085 inst->dst.type != inst->src[0].type ||
2086 inst->src[0].abs || inst->src[0].negate ||
2087 !inst->src[0].is_contiguous() ||
2088 inst->src[0].subreg_offset)
2089 continue;
2090
2091 /* Work out which hardware MRF registers are written by this
2092 * instruction.
2093 */
2094 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2095 int mrf_high;
2096 if (inst->dst.reg & BRW_MRF_COMPR4) {
2097 mrf_high = mrf_low + 4;
2098 } else if (dispatch_width == 16 &&
2099 (!inst->force_uncompressed && !inst->force_sechalf)) {
2100 mrf_high = mrf_low + 1;
2101 } else {
2102 mrf_high = mrf_low;
2103 }
2104
2105 /* Can't compute-to-MRF this GRF if someone else was going to
2106 * read it later.
2107 */
2108 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2109 continue;
2110
2111 /* Found a move of a GRF to a MRF. Let's see if we can go
2112 * rewrite the thing that made this GRF to write into the MRF.
2113 */
2114 fs_inst *scan_inst;
2115 for (scan_inst = (fs_inst *)inst->prev;
2116 !scan_inst->is_head_sentinel();
2117 scan_inst = (fs_inst *)scan_inst->prev) {
2118 if (scan_inst->dst.file == GRF &&
2119 scan_inst->dst.reg == inst->src[0].reg) {
2120 /* Found the last thing to write our reg we want to turn
2121 * into a compute-to-MRF.
2122 */
2123
2124 /* If this one instruction didn't populate all the
2125 * channels, bail. We might be able to rewrite everything
2126 * that writes that reg, but it would require smarter
2127 * tracking to delay the rewriting until complete success.
2128 */
2129 if (scan_inst->is_partial_write())
2130 break;
2131
2132 /* Things returning more than one register would need us to
2133 * understand coalescing out more than one MOV at a time.
2134 */
2135 if (scan_inst->regs_written > 1)
2136 break;
2137
2138 /* SEND instructions can't have MRF as a destination. */
2139 if (scan_inst->mlen)
2140 break;
2141
2142 if (brw->gen == 6) {
2143 /* gen6 math instructions must have the destination be
2144 * GRF, so no compute-to-MRF for them.
2145 */
2146 if (scan_inst->is_math()) {
2147 break;
2148 }
2149 }
2150
2151 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2152 /* Found the creator of our MRF's source value. */
2153 scan_inst->dst.file = MRF;
2154 scan_inst->dst.reg = inst->dst.reg;
2155 scan_inst->saturate |= inst->saturate;
2156 inst->remove();
2157 progress = true;
2158 }
2159 break;
2160 }
2161
2162 /* We don't handle control flow here. Most computation of
2163 * values that end up in MRFs happens shortly before the MRF
2164 * write anyway.
2165 */
2166 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2167 break;
2168
2169 /* You can't read from an MRF, so if someone else reads our
2170 * MRF's source GRF that we wanted to rewrite, that stops us.
2171 */
2172 bool interfered = false;
2173 for (int i = 0; i < scan_inst->sources; i++) {
2174 if (scan_inst->src[i].file == GRF &&
2175 scan_inst->src[i].reg == inst->src[0].reg &&
2176 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2177 interfered = true;
2178 }
2179 }
2180 if (interfered)
2181 break;
2182
2183 if (scan_inst->dst.file == MRF) {
2184 /* If somebody else writes our MRF here, we can't
2185 * compute-to-MRF before that.
2186 */
2187 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2188 int scan_mrf_high;
2189
2190 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2191 scan_mrf_high = scan_mrf_low + 4;
2192 } else if (dispatch_width == 16 &&
2193 (!scan_inst->force_uncompressed &&
2194 !scan_inst->force_sechalf)) {
2195 scan_mrf_high = scan_mrf_low + 1;
2196 } else {
2197 scan_mrf_high = scan_mrf_low;
2198 }
2199
2200 if (mrf_low == scan_mrf_low ||
2201 mrf_low == scan_mrf_high ||
2202 mrf_high == scan_mrf_low ||
2203 mrf_high == scan_mrf_high) {
2204 break;
2205 }
2206 }
2207
2208 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2209 /* Found a SEND instruction, which means that there are
2210 * live values in MRFs from base_mrf to base_mrf +
2211 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2212 * above it.
2213 */
2214 if (mrf_low >= scan_inst->base_mrf &&
2215 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2216 break;
2217 }
2218 if (mrf_high >= scan_inst->base_mrf &&
2219 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2220 break;
2221 }
2222 }
2223 }
2224 }
2225
2226 if (progress)
2227 invalidate_live_intervals();
2228
2229 return progress;
2230 }
2231
2232 /**
2233 * Walks through basic blocks, looking for repeated MRF writes and
2234 * removing the later ones.
2235 */
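/* For instance (hypothetical IR), two identical header setups
 *
 *    mov m2, vgrf7
 *    ...no control flow, no other write to m2 or vgrf7...
 *    mov m2, vgrf7
 *
 * leave the second MOV with nothing to do, so it is removed below.
 */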
2236 bool
2237 fs_visitor::remove_duplicate_mrf_writes()
2238 {
2239 fs_inst *last_mrf_move[16];
2240 bool progress = false;
2241
2242 /* We would need to update the MRF tracking to handle compressed (SIMD16) instructions, so just bail. */
2243 if (dispatch_width == 16)
2244 return false;
2245
2246 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2247
2248 foreach_in_list_safe(fs_inst, inst, &instructions) {
2249 if (inst->is_control_flow()) {
2250 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2251 }
2252
2253 if (inst->opcode == BRW_OPCODE_MOV &&
2254 inst->dst.file == MRF) {
2255 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2256 if (prev_inst && inst->equals(prev_inst)) {
2257 inst->remove();
2258 progress = true;
2259 continue;
2260 }
2261 }
2262
2263 /* Clear out the last-write records for MRFs that were overwritten. */
2264 if (inst->dst.file == MRF) {
2265 last_mrf_move[inst->dst.reg] = NULL;
2266 }
2267
2268 if (inst->mlen > 0 && inst->base_mrf != -1) {
2269 /* Found a SEND instruction, which will include two or fewer
2270 * implied MRF writes. We could do better here.
2271 */
2272 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2273 last_mrf_move[inst->base_mrf + i] = NULL;
2274 }
2275 }
2276
2277 /* Clear out any MRF move records whose sources got overwritten. */
2278 if (inst->dst.file == GRF) {
2279 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2280 if (last_mrf_move[i] &&
2281 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2282 last_mrf_move[i] = NULL;
2283 }
2284 }
2285 }
2286
2287 if (inst->opcode == BRW_OPCODE_MOV &&
2288 inst->dst.file == MRF &&
2289 inst->src[0].file == GRF &&
2290 !inst->is_partial_write()) {
2291 last_mrf_move[inst->dst.reg] = inst;
2292 }
2293 }
2294
2295 if (progress)
2296 invalidate_live_intervals();
2297
2298 return progress;
2299 }
2300
2301 static void
2302 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2303 int first_grf, int grf_len)
2304 {
2305 bool inst_simd16 = (dispatch_width > 8 &&
2306 !inst->force_uncompressed &&
2307 !inst->force_sechalf);
2308
2309 /* Clear the flag for registers that actually got read (as expected). */
2310 for (int i = 0; i < inst->sources; i++) {
2311 int grf;
2312 if (inst->src[i].file == GRF) {
2313 grf = inst->src[i].reg;
2314 } else if (inst->src[i].file == HW_REG &&
2315 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2316 grf = inst->src[i].fixed_hw_reg.nr;
2317 } else {
2318 continue;
2319 }
2320
2321 if (grf >= first_grf &&
2322 grf < first_grf + grf_len) {
2323 deps[grf - first_grf] = false;
2324 if (inst_simd16)
2325 deps[grf - first_grf + 1] = false;
2326 }
2327 }
2328 }
2329
2330 /**
2331 * Implements this workaround for the original 965:
2332 *
2333 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2334 * check for post destination dependencies on this instruction, software
2335 * must ensure that there is no destination hazard for the case of ‘write
2336 * followed by a posted write’ shown in the following example.
2337 *
2338 * 1. mov r3 0
2339 * 2. send r3.xy <rest of send instruction>
2340 * 3. mov r2 r3
2341 *
2342 * Due to no post-destination dependency check on the ‘send’, the above
2343 * code sequence could have two instructions (1 and 2) in flight at the
2344 * same time that both consider ‘r3’ as the target of their final writes.
2345 */
2346 void
2347 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2348 {
2349 int reg_size = dispatch_width / 8;
2350 int write_len = inst->regs_written * reg_size;
2351 int first_write_grf = inst->dst.reg;
2352 bool needs_dep[BRW_MAX_MRF];
2353 assert(write_len < (int)sizeof(needs_dep) - 1);
2354
2355 memset(needs_dep, false, sizeof(needs_dep));
2356 memset(needs_dep, true, write_len);
2357
2358 clear_deps_for_inst_src(inst, dispatch_width,
2359 needs_dep, first_write_grf, write_len);
2360
2361 /* Walk backwards looking for writes to registers we're writing which
2362 * aren't read since being written. If we hit the start of the program,
2363 * we assume that there are no outstanding dependencies on entry to the
2364 * program.
2365 */
2366 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2367 !scan_inst->is_head_sentinel();
2368 scan_inst = (fs_inst *)scan_inst->prev) {
2369
2370 /* If we hit control flow, assume that there *are* outstanding
2371 * dependencies, and force their cleanup before our instruction.
2372 */
2373 if (scan_inst->is_control_flow()) {
2374 for (int i = 0; i < write_len; i++) {
2375 if (needs_dep[i]) {
2376 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2377 }
2378 }
2379 return;
2380 }
2381
2382 bool scan_inst_simd16 = (dispatch_width > 8 &&
2383 !scan_inst->force_uncompressed &&
2384 !scan_inst->force_sechalf);
2385
2386 /* We insert our reads as late as possible on the assumption that any
2387 * instruction but a MOV that might have left us an outstanding
2388 * dependency has more latency than a MOV.
2389 */
2390 if (scan_inst->dst.file == GRF) {
2391 for (int i = 0; i < scan_inst->regs_written; i++) {
2392 int reg = scan_inst->dst.reg + i * reg_size;
2393
2394 if (reg >= first_write_grf &&
2395 reg < first_write_grf + write_len &&
2396 needs_dep[reg - first_write_grf]) {
2397 inst->insert_before(DEP_RESOLVE_MOV(reg));
2398 needs_dep[reg - first_write_grf] = false;
2399 if (scan_inst_simd16)
2400 needs_dep[reg - first_write_grf + 1] = false;
2401 }
2402 }
2403 }
2404
2405 /* Clear the flag for registers that actually got read (as expected). */
2406 clear_deps_for_inst_src(scan_inst, dispatch_width,
2407 needs_dep, first_write_grf, write_len);
2408
2409 /* Continue the loop only if we haven't resolved all the dependencies */
2410 int i;
2411 for (i = 0; i < write_len; i++) {
2412 if (needs_dep[i])
2413 break;
2414 }
2415 if (i == write_len)
2416 return;
2417 }
2418 }
2419
2420 /**
2421 * Implements this workaround for the original 965:
2422 *
2423 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2424 * used as a destination register until after it has been sourced by an
2425 * instruction with a different destination register.
2426 */
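/* In IR terms (registers invented for illustration), a sequence such as
 *
 *    send vgrf8, ...            <- posted write to vgrf8
 *    ...
 *    add vgrf8, vgrf1, vgrf2    <- reuses vgrf8 as a destination
 *
 * needs a read of vgrf8 (a DEP_RESOLVE_MOV, inserted below) before the ADD
 * so that the send's write is known to have landed first.
 */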
2427 void
2428 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2429 {
2430 int write_len = inst->regs_written * dispatch_width / 8;
2431 int first_write_grf = inst->dst.reg;
2432 bool needs_dep[BRW_MAX_MRF];
2433 assert(write_len < (int)sizeof(needs_dep) - 1);
2434
2435 memset(needs_dep, false, sizeof(needs_dep));
2436 memset(needs_dep, true, write_len);
2437 /* Walk forwards looking for writes to registers we're writing which aren't
2438 * read before being written.
2439 */
2440 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2441 !scan_inst->is_tail_sentinel();
2442 scan_inst = (fs_inst *)scan_inst->next) {
2443 /* If we hit control flow, force resolve all remaining dependencies. */
2444 if (scan_inst->is_control_flow()) {
2445 for (int i = 0; i < write_len; i++) {
2446 if (needs_dep[i])
2447 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2448 }
2449 return;
2450 }
2451
2452 /* Clear the flag for registers that actually got read (as expected). */
2453 clear_deps_for_inst_src(scan_inst, dispatch_width,
2454 needs_dep, first_write_grf, write_len);
2455
2456 /* We insert our reads as late as possible since they're reading the
2457 * result of a SEND, which has massive latency.
2458 */
2459 if (scan_inst->dst.file == GRF &&
2460 scan_inst->dst.reg >= first_write_grf &&
2461 scan_inst->dst.reg < first_write_grf + write_len &&
2462 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2463 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2464 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2465 }
2466
2467 /* Continue the loop only if we haven't resolved all the dependencies */
2468 int i;
2469 for (i = 0; i < write_len; i++) {
2470 if (needs_dep[i])
2471 break;
2472 }
2473 if (i == write_len)
2474 return;
2475 }
2476
2477 /* If we hit the end of the program, resolve all remaining dependencies out
2478 * of paranoia.
2479 */
2480 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2481 assert(last_inst->eot);
2482 for (int i = 0; i < write_len; i++) {
2483 if (needs_dep[i])
2484 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2485 }
2486 }
2487
2488 void
2489 fs_visitor::insert_gen4_send_dependency_workarounds()
2490 {
2491 if (brw->gen != 4 || brw->is_g4x)
2492 return;
2493
2494 bool progress = false;
2495
2496 /* Note that we're done with register allocation, so GRF fs_regs always
2497 * have a .reg_offset of 0.
2498 */
2499
2500 foreach_in_list_safe(fs_inst, inst, &instructions) {
2501 if (inst->mlen != 0 && inst->dst.file == GRF) {
2502 insert_gen4_pre_send_dependency_workarounds(inst);
2503 insert_gen4_post_send_dependency_workarounds(inst);
2504 progress = true;
2505 }
2506 }
2507
2508 if (progress)
2509 invalidate_live_intervals();
2510 }
2511
2512 /**
2513 * Turns the generic expression-style uniform pull constant load instruction
2514 * into a hardware-specific series of instructions for loading a pull
2515 * constant.
2516 *
2517 * The expression style allows the CSE pass before this to optimize out
2518 * repeated loads from the same offset, and gives the pre-register-allocation
2519 * scheduling full flexibility, while the conversion to native instructions
2520 * allows the post-register-allocation scheduler the best information
2521 * possible.
2522 *
2523 * Note that execution masking for setting up pull constant loads is special:
2524 * the channels that need to be written are unrelated to the current execution
2525 * mask, since a later instruction will use one of the result channels as a
2526 * source operand for all 8 or 16 of its channels.
2527 */
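/* For example, on the gen7 path below a load whose immediate src[1] is the
 * byte offset 48 (a number invented for illustration) is rewritten so that
 * FS_OPCODE_SET_SIMD4X2_OFFSET first materializes the dword offset
 * 48 / 4 = 12 in a payload register, and the load itself becomes
 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 sourcing that payload.
 */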
2528 void
2529 fs_visitor::lower_uniform_pull_constant_loads()
2530 {
2531 foreach_in_list(fs_inst, inst, &instructions) {
2532 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2533 continue;
2534
2535 if (brw->gen >= 7) {
2536 /* The offset arg before was a vec4-aligned byte offset. We need to
2537 * turn it into a dword offset.
2538 */
2539 fs_reg const_offset_reg = inst->src[1];
2540 assert(const_offset_reg.file == IMM &&
2541 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2542 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2543 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2544
2545 /* This is actually going to be a MOV, but since only the first dword
2546 * is accessed, we have a special opcode to do just that one dword. Note
2547 * that this needs to be an operation that will be considered a def
2548 * by live variable analysis, or register allocation will explode.
2549 */
2550 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2551 payload, const_offset_reg);
2552 setup->force_writemask_all = true;
2553
2554 setup->ir = inst->ir;
2555 setup->annotation = inst->annotation;
2556 inst->insert_before(setup);
2557
2558 /* Similarly, this will only populate the first 4 channels of the
2559 * result register (since we only use smear values from 0-3), but we
2560 * don't tell the optimizer.
2561 */
2562 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2563 inst->src[1] = payload;
2564
2565 invalidate_live_intervals();
2566 } else {
2567 /* Before register allocation, we didn't tell the scheduler about the
2568 * MRF we use. We know it's safe to use this MRF because nothing
2569 * else does except for register spill/unspill, which generates and
2570 * uses its MRF within a single IR instruction.
2571 */
2572 inst->base_mrf = 14;
2573 inst->mlen = 1;
2574 }
2575 }
2576 }
2577
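/**
 * Expands SHADER_OPCODE_LOAD_PAYLOAD into a series of plain MOVs into
 * consecutive reg_offsets of the destination.
 *
 * Roughly (register numbers are illustrative only), a LOAD_PAYLOAD with a
 * header in src[0] and two further sources becomes
 *
 *    mov vgrf10+0, src[0]    <- omitted when src[0] is BAD_FILE
 *    mov vgrf10+1, src[1]
 *    mov vgrf10+2, src[2]
 *
 * and the original instruction is removed.
 */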
2578 bool
2579 fs_visitor::lower_load_payload()
2580 {
2581 bool progress = false;
2582
2583 foreach_in_list_safe(fs_inst, inst, &instructions) {
2584 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2585 fs_reg dst = inst->dst;
2586
2587 /* src[0] represents the (optional) message header. */
2588 if (inst->src[0].file != BAD_FILE) {
2589 inst->insert_before(MOV(dst, inst->src[0]));
2590 }
2591 dst.reg_offset++;
2592
2593 for (int i = 1; i < inst->sources; i++) {
2594 inst->insert_before(MOV(dst, inst->src[i]));
2595 dst.reg_offset++;
2596 }
2597
2598 inst->remove();
2599 progress = true;
2600 }
2601 }
2602
2603 if (progress)
2604 invalidate_live_intervals();
2605
2606 return progress;
2607 }
2608
2609 void
2610 fs_visitor::dump_instructions()
2611 {
2612 dump_instructions(NULL);
2613 }
2614
2615 void
2616 fs_visitor::dump_instructions(const char *name)
2617 {
2618 calculate_register_pressure();
2619 FILE *file = stderr;
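/* Only redirect output to the named file when not running as root,
 * presumably so that a privileged (e.g. setuid) client never creates debug
 * files; otherwise everything goes to stderr.
 */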
2620 if (name && geteuid() != 0) {
2621 file = fopen(name, "w");
2622 if (!file)
2623 file = stderr;
2624 }
2625
2626 int ip = 0, max_pressure = 0;
2627 foreach_in_list(backend_instruction, inst, &instructions) {
2628 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2629 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2630 dump_instruction(inst, file);
2631 ++ip;
2632 }
2633 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2634
2635 if (file != stderr) {
2636 fclose(file);
2637 }
2638 }
2639
2640 void
2641 fs_visitor::dump_instruction(backend_instruction *be_inst)
2642 {
2643 dump_instruction(be_inst, stderr);
2644 }
2645
2646 void
2647 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2648 {
2649 fs_inst *inst = (fs_inst *)be_inst;
2650
2651 if (inst->predicate) {
2652 fprintf(file, "(%cf0.%d) ",
2653 inst->predicate_inverse ? '-' : '+',
2654 inst->flag_subreg);
2655 }
2656
2657 fprintf(file, "%s", brw_instruction_name(inst->opcode));
2658 if (inst->saturate)
2659 fprintf(file, ".sat");
2660 if (inst->conditional_mod) {
2661 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2662 if (!inst->predicate &&
2663 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2664 inst->opcode != BRW_OPCODE_IF &&
2665 inst->opcode != BRW_OPCODE_WHILE))) {
2666 fprintf(file, ".f0.%d", inst->flag_subreg);
2667 }
2668 }
2669 fprintf(file, " ");
2670
2671
2672 switch (inst->dst.file) {
2673 case GRF:
2674 fprintf(file, "vgrf%d", inst->dst.reg);
2675 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2676 inst->dst.subreg_offset)
2677 fprintf(file, "+%d.%d",
2678 inst->dst.reg_offset, inst->dst.subreg_offset);
2679 break;
2680 case MRF:
2681 fprintf(file, "m%d", inst->dst.reg);
2682 break;
2683 case BAD_FILE:
2684 fprintf(file, "(null)");
2685 break;
2686 case UNIFORM:
2687 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2688 break;
2689 case HW_REG:
2690 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2691 switch (inst->dst.fixed_hw_reg.nr) {
2692 case BRW_ARF_NULL:
2693 fprintf(file, "null");
2694 break;
2695 case BRW_ARF_ADDRESS:
2696 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2697 break;
2698 case BRW_ARF_ACCUMULATOR:
2699 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2700 break;
2701 case BRW_ARF_FLAG:
2702 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2703 inst->dst.fixed_hw_reg.subnr);
2704 break;
2705 default:
2706 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2707 inst->dst.fixed_hw_reg.subnr);
2708 break;
2709 }
2710 } else {
2711 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2712 }
2713 if (inst->dst.fixed_hw_reg.subnr)
2714 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2715 break;
2716 default:
2717 fprintf(file, "???");
2718 break;
2719 }
2720 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2721
2722 for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2723 if (inst->src[i].negate)
2724 fprintf(file, "-");
2725 if (inst->src[i].abs)
2726 fprintf(file, "|");
2727 switch (inst->src[i].file) {
2728 case GRF:
2729 fprintf(file, "vgrf%d", inst->src[i].reg);
2730 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2731 inst->src[i].subreg_offset)
2732 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2733 inst->src[i].subreg_offset);
2734 break;
2735 case MRF:
2736 fprintf(file, "***m%d***", inst->src[i].reg);
2737 break;
2738 case UNIFORM:
2739 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2740 if (inst->src[i].reladdr) {
2741 fprintf(file, "+reladdr");
2742 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2743 inst->src[i].subreg_offset) {
2744 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2745 inst->src[i].subreg_offset);
2746 }
2747 break;
2748 case BAD_FILE:
2749 fprintf(file, "(null)");
2750 break;
2751 case IMM:
2752 switch (inst->src[i].type) {
2753 case BRW_REGISTER_TYPE_F:
2754 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
2755 break;
2756 case BRW_REGISTER_TYPE_D:
2757 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
2758 break;
2759 case BRW_REGISTER_TYPE_UD:
2760 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
2761 break;
2762 default:
2763 fprintf(file, "???");
2764 break;
2765 }
2766 break;
2767 case HW_REG:
2768 if (inst->src[i].fixed_hw_reg.negate)
2769 fprintf(file, "-");
2770 if (inst->src[i].fixed_hw_reg.abs)
2771 fprintf(file, "|");
2772 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2773 switch (inst->src[i].fixed_hw_reg.nr) {
2774 case BRW_ARF_NULL:
2775 fprintf(file, "null");
2776 break;
2777 case BRW_ARF_ADDRESS:
2778 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2779 break;
2780 case BRW_ARF_ACCUMULATOR:
2781 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2782 break;
2783 case BRW_ARF_FLAG:
2784 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2785 inst->src[i].fixed_hw_reg.subnr);
2786 break;
2787 default:
2788 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2789 inst->src[i].fixed_hw_reg.subnr);
2790 break;
2791 }
2792 } else {
2793 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2794 }
2795 if (inst->src[i].fixed_hw_reg.subnr)
2796 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2797 if (inst->src[i].fixed_hw_reg.abs)
2798 fprintf(file, "|");
2799 break;
2800 default:
2801 fprintf(file, "???");
2802 break;
2803 }
2804 if (inst->src[i].abs)
2805 fprintf(file, "|");
2806
2807 if (inst->src[i].file != IMM) {
2808 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2809 }
2810
2811 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2812 fprintf(file, ", ");
2813 }
2814
2815 fprintf(file, " ");
2816
2817 if (inst->force_uncompressed)
2818 fprintf(file, "1sthalf ");
2819
2820 if (inst->force_sechalf)
2821 fprintf(file, "2ndhalf ");
2822
2823 fprintf(file, "\n");
2824 }
2825
2826 /**
2827 * Possibly returns an instruction that set up @param reg.
2828 *
2829 * Sometimes we want to take the result of some expression/variable
2830 * dereference tree and rewrite the instruction generating the result
2831 * of the tree. When processing the tree, we know that the
2832 * instructions generated are all writing temporaries that are dead
2833 * outside of this tree. So, if we have some instructions that write
2834 * a temporary, we're free to point that temp write somewhere else.
2835 *
2836 * Note that this doesn't guarantee that the returned instruction wrote only
2837 * reg -- it might be the size=4 destination of a texture instruction.
2838 */
2839 fs_inst *
2840 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2841 fs_inst *end,
2842 const fs_reg &reg)
2843 {
2844 if (end == start ||
2845 end->is_partial_write() ||
2846 reg.reladdr ||
2847 !reg.equals(end->dst)) {
2848 return NULL;
2849 } else {
2850 return end;
2851 }
2852 }
2853
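/* A worked example of how payload.num_regs accumulates below, for a
 * hypothetical SIMD8 shader with one barycentric mode enabled, source depth
 * in use, and neither position offsets nor an input coverage mask:
 *
 *    masks, pixel X/Y coordinates    regs 0-1
 *    barycentric coordinates         regs 2-3   (2 regs per mode in SIMD8)
 *    interpolated depth              reg  4
 *    interpolated W                  reg  5
 *
 * giving payload.num_regs == 6.  In SIMD16 the barycentric, depth and W
 * entries each occupy twice as many registers.
 */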
2854 void
2855 fs_visitor::setup_payload_gen6()
2856 {
2857 bool uses_depth =
2858 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2859 unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
2860
2861 assert(brw->gen >= 6);
2862
2863 /* R0-1: masks, pixel X/Y coordinates. */
2864 payload.num_regs = 2;
2865 /* R2: only for 32-pixel dispatch. */
2866
2867 /* R3-26: barycentric interpolation coordinates. These appear in the
2868 * same order that they appear in the brw_wm_barycentric_interp_mode
2869 * enum. Each set of coordinates occupies 2 registers if dispatch width
2870 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2871 * appear if they were enabled using the "Barycentric Interpolation
2872 * Mode" bits in WM_STATE.
2873 */
2874 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2875 if (barycentric_interp_modes & (1 << i)) {
2876 payload.barycentric_coord_reg[i] = payload.num_regs;
2877 payload.num_regs += 2;
2878 if (dispatch_width == 16) {
2879 payload.num_regs += 2;
2880 }
2881 }
2882 }
2883
2884 /* R27: interpolated depth if the shader uses source depth */
2885 if (uses_depth) {
2886 payload.source_depth_reg = payload.num_regs;
2887 payload.num_regs++;
2888 if (dispatch_width == 16) {
2889 /* R28: interpolated depth if not SIMD8. */
2890 payload.num_regs++;
2891 }
2892 }
2893 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2894 if (uses_depth) {
2895 payload.source_w_reg = payload.num_regs;
2896 payload.num_regs++;
2897 if (dispatch_width == 16) {
2898 /* R30: interpolated W if not SIMD8. */
2899 payload.num_regs++;
2900 }
2901 }
2902
2903 prog_data->uses_pos_offset = key->compute_pos_offset;
2904 /* R31: MSAA position offsets. */
2905 if (prog_data->uses_pos_offset) {
2906 payload.sample_pos_reg = payload.num_regs;
2907 payload.num_regs++;
2908 }
2909
2910 /* R32: MSAA input coverage mask */
2911 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2912 assert(brw->gen >= 7);
2913 payload.sample_mask_in_reg = payload.num_regs;
2914 payload.num_regs++;
2915 if (dispatch_width == 16) {
2916 /* R33: input coverage mask if not SIMD8. */
2917 payload.num_regs++;
2918 }
2919 }
2920
2921 /* R34-: bary for 32-pixel. */
2922 /* R58-59: interp W for 32-pixel. */
2923
2924 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2925 source_depth_to_render_target = true;
2926 }
2927 }
2928
2929 void
2930 fs_visitor::assign_binding_table_offsets()
2931 {
2932 uint32_t next_binding_table_offset = 0;
2933
2934 /* If there are no color regions, we still perform an FB write to a null
2935 * renderbuffer, which we place at surface index 0.
2936 */
2937 prog_data->binding_table.render_target_start = next_binding_table_offset;
2938 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
2939
2940 assign_common_binding_table_offsets(next_binding_table_offset);
2941 }
2942
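/**
 * Estimates register pressure by recording, for each instruction IP, the
 * total size of every virtual GRF whose live range covers that IP.
 */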
2943 void
2944 fs_visitor::calculate_register_pressure()
2945 {
2946 invalidate_live_intervals();
2947 calculate_live_intervals();
2948
2949 unsigned num_instructions = instructions.length();
2950
2951 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
2952
2953 for (int reg = 0; reg < virtual_grf_count; reg++) {
2954 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
2955 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
2956 }
2957 }
2958
2959 /**
2960 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
2961 *
2962 * The needs_unlit_centroid_workaround ends up producing one of these per
2963 * channel of centroid input, so it's good to clean them up.
2964 *
2965 * An assumption here is that nothing ever modifies the dispatched pixels
2966 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
2967 * dictates that anyway.
2968 */
2969 void
2970 fs_visitor::opt_drop_redundant_mov_to_flags()
2971 {
2972 bool flag_mov_found[2] = {false};
2973
2974 foreach_in_list_safe(fs_inst, inst, &instructions) {
2975 if (inst->is_control_flow()) {
2976 memset(flag_mov_found, 0, sizeof(flag_mov_found));
2977 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
2978 if (!flag_mov_found[inst->flag_subreg])
2979 flag_mov_found[inst->flag_subreg] = true;
2980 else
2981 inst->remove();
2982 } else if (inst->writes_flag()) {
2983 flag_mov_found[inst->flag_subreg] = false;
2984 }
2985 }
2986 }
2987
2988 bool
2989 fs_visitor::run()
2990 {
2991 sanity_param_count = fp->Base.Parameters->NumParameters;
2992 bool allocated_without_spills;
2993
2994 assign_binding_table_offsets();
2995
2996 if (brw->gen >= 6)
2997 setup_payload_gen6();
2998 else
2999 setup_payload_gen4();
3000
3001 if (0) {
3002 emit_dummy_fs();
3003 } else {
3004 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3005 emit_shader_time_begin();
3006
3007 calculate_urb_setup();
3008 if (fp->Base.InputsRead > 0) {
3009 if (brw->gen < 6)
3010 emit_interpolation_setup_gen4();
3011 else
3012 emit_interpolation_setup_gen6();
3013 }
3014
3015 /* We handle discards by keeping track of the still-live pixels in f0.1.
3016 * Initialize it with the dispatched pixels.
3017 */
3018 if (fp->UsesKill || key->alpha_test_func) {
3019 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3020 discard_init->flag_subreg = 1;
3021 }
3022
3023 /* Generate FS IR for main(). (the visitor only descends into
3024 * functions called "main").
3025 */
3026 if (shader) {
3027 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3028 base_ir = ir;
3029 this->result = reg_undef;
3030 ir->accept(this);
3031 }
3032 } else {
3033 emit_fragment_program_code();
3034 }
3035 base_ir = NULL;
3036 if (failed)
3037 return false;
3038
3039 emit(FS_OPCODE_PLACEHOLDER_HALT);
3040
3041 if (key->alpha_test_func)
3042 emit_alpha_test();
3043
3044 emit_fb_writes();
3045
3046 split_virtual_grfs();
3047
3048 move_uniform_array_access_to_pull_constants();
3049 assign_constant_locations();
3050 demote_pull_constants();
3051
3052 opt_drop_redundant_mov_to_flags();
3053
3054 #define OPT(pass, args...) do { \
3055 pass_num++; \
3056 bool this_progress = pass(args); \
3057 \
3058 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3059 char filename[64]; \
3060 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3061 dispatch_width, shader_prog->Name, iteration, pass_num); \
3062 \
3063 backend_visitor::dump_instructions(filename); \
3064 } \
3065 \
3066 progress = progress || this_progress; \
3067 } while (false)
3068
3069 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3070 char filename[64];
3071 snprintf(filename, 64, "fs%d-%04d-00-start",
3072 dispatch_width, shader_prog->Name);
3073
3074 backend_visitor::dump_instructions(filename);
3075 }
3076
3077 bool progress;
3078 int iteration = 0;
3079 do {
3080 progress = false;
3081 iteration++;
3082 int pass_num = 0;
3083
3084 compact_virtual_grfs();
3085
3086 OPT(remove_duplicate_mrf_writes);
3087
3088 OPT(opt_algebraic);
3089 OPT(opt_cse);
3090 OPT(opt_copy_propagate);
3091 OPT(opt_peephole_predicated_break);
3092 OPT(dead_code_eliminate);
3093 OPT(opt_peephole_sel);
3094 OPT(dead_control_flow_eliminate, this);
3095 OPT(opt_saturate_propagation);
3096 OPT(register_coalesce);
3097 OPT(compute_to_mrf);
3098 } while (progress);
3099
3100 if (lower_load_payload()) {
3101 register_coalesce();
3102 dead_code_eliminate();
3103 }
3104
3105 lower_uniform_pull_constant_loads();
3106
3107 assign_curb_setup();
3108 assign_urb_setup();
3109
3110 static enum instruction_scheduler_mode pre_modes[] = {
3111 SCHEDULE_PRE,
3112 SCHEDULE_PRE_NON_LIFO,
3113 SCHEDULE_PRE_LIFO,
3114 };
3115
3116 /* Try each scheduling heuristic to see if it can successfully register
3117 * allocate without spilling. They should be ordered by decreasing
3118 * performance but increasing likelihood of allocating.
3119 */
3120 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3121 schedule_instructions(pre_modes[i]);
3122
3123 if (0) {
3124 assign_regs_trivial();
3125 allocated_without_spills = true;
3126 } else {
3127 allocated_without_spills = assign_regs(false);
3128 }
3129 if (allocated_without_spills)
3130 break;
3131 }
3132
3133 if (!allocated_without_spills) {
3134 /* We assume that any spilling is worse than just dropping back to
3135 * SIMD8. There's probably actually some intermediate point where
3136 * SIMD16 with a couple of spills is still better.
3137 */
3138 if (dispatch_width == 16) {
3139 fail("Failure to register allocate. Reduce number of "
3140 "live scalar values to avoid this.");
3141 } else {
3142 perf_debug("Fragment shader triggered register spilling. "
3143 "Try reducing the number of live scalar values to "
3144 "improve performance.\n");
3145 }
3146
3147 /* Since we're out of heuristics, just go spill registers until we
3148 * get an allocation.
3149 */
3150 while (!assign_regs(true)) {
3151 if (failed)
3152 break;
3153 }
3154 }
3155 }
3156 assert(force_uncompressed_stack == 0);
3157
3158 /* This must come after all optimization and register allocation, since
3159 * it inserts dead code that happens to have side effects, and it does
3160 * so based on the actual physical registers in use.
3161 */
3162 insert_gen4_send_dependency_workarounds();
3163
3164 if (failed)
3165 return false;
3166
3167 if (!allocated_without_spills)
3168 schedule_instructions(SCHEDULE_POST);
3169
3170 if (last_scratch > 0) {
3171 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3172 }
3173
3174 if (dispatch_width == 8)
3175 prog_data->reg_blocks = brw_register_blocks(grf_used);
3176 else
3177 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3178
3179 /* If any state parameters were appended, then ParameterValues could have
3180 * been realloced, in which case the driver uniform storage set up by
3181 * _mesa_associate_uniform_storage() would point to freed memory. Make
3182 * sure that didn't happen.
3183 */
3184 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3185
3186 return !failed;
3187 }
3188
3189 const unsigned *
3190 brw_wm_fs_emit(struct brw_context *brw,
3191 void *mem_ctx,
3192 const struct brw_wm_prog_key *key,
3193 struct brw_wm_prog_data *prog_data,
3194 struct gl_fragment_program *fp,
3195 struct gl_shader_program *prog,
3196 unsigned *final_assembly_size)
3197 {
3198 bool start_busy = false;
3199 double start_time = 0;
3200
3201 if (unlikely(brw->perf_debug)) {
3202 start_busy = (brw->batch.last_bo &&
3203 drm_intel_bo_busy(brw->batch.last_bo));
3204 start_time = get_time();
3205 }
3206
3207 struct brw_shader *shader = NULL;
3208 if (prog)
3209 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3210
3211 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3212 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3213
3214 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3215 */
3216 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3217 if (!v.run()) {
3218 if (prog) {
3219 prog->LinkStatus = false;
3220 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3221 }
3222
3223 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3224 v.fail_msg);
3225
3226 return NULL;
3227 }
3228
3229 exec_list *simd16_instructions = NULL;
3230 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3231 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3232 if (!v.simd16_unsupported) {
3233 /* Try a SIMD16 compile */
3234 v2.import_uniforms(&v);
3235 if (!v2.run()) {
3236 perf_debug("SIMD16 shader failed to compile, falling back to "
3237 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3238 } else {
3239 simd16_instructions = &v2.instructions;
3240 }
3241 } else {
3242 perf_debug("SIMD16 shader unsupported, falling back to "
3243 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3244 }
3245 }
3246
3247 const unsigned *assembly = NULL;
3248 if (brw->gen >= 8) {
3249 gen8_fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src);
3250 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3251 final_assembly_size);
3252 } else {
3253 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src,
3254 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3255 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3256 final_assembly_size);
3257 }
3258
3259 if (unlikely(brw->perf_debug) && shader) {
3260 if (shader->compiled_once)
3261 brw_wm_debug_recompile(brw, prog, key);
3262 shader->compiled_once = true;
3263
3264 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3265 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3266 (get_time() - start_time) * 1000);
3267 }
3268 }
3269
3270 return assembly;
3271 }
3272
3273 bool
3274 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3275 {
3276 struct brw_context *brw = brw_context(ctx);
3277 struct brw_wm_prog_key key;
3278
3279 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3280 return true;
3281
3282 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3283 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3284 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3285 bool program_uses_dfdy = fp->UsesDFdy;
3286
3287 memset(&key, 0, sizeof(key));
3288
3289 if (brw->gen < 6) {
3290 if (fp->UsesKill)
3291 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3292
3293 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3294 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3295
3296 /* Just assume depth testing. */
3297 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3298 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3299 }
3300
3301 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3302 BRW_FS_VARYING_INPUT_MASK) > 16)
3303 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3304
3305 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3306 for (unsigned i = 0; i < sampler_count; i++) {
3307 if (fp->Base.ShadowSamplers & (1 << i)) {
3308 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3309 key.tex.swizzles[i] =
3310 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3311 } else {
3312 /* Color sampler: assume no swizzling. */
3313 key.tex.swizzles[i] = SWIZZLE_XYZW;
3314 }
3315 }
3316
3317 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3318 key.drawable_height = ctx->DrawBuffer->Height;
3319 }
3320
3321 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3322 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3323 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3324
3325 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3326 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3327 key.nr_color_regions > 1;
3328 }
3329
3330 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3331 * quality of the derivatives is likely to be determined by the driconf
3332 * option.
3333 */
3334 key.high_quality_derivatives = brw->disable_derivative_optimization;
3335
3336 key.program_string_id = bfp->id;
3337
3338 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3339 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3340
3341 bool success = do_wm_prog(brw, prog, bfp, &key);
3342
3343 brw->wm.base.prog_offset = old_prog_offset;
3344 brw->wm.prog_data = old_prog_data;
3345
3346 return success;
3347 }