i965: Move pre-draw resolve buffers to dd::UpdateState
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
56 {
57 memset(this, 0, sizeof(*this));
58
59 this->opcode = opcode;
60 this->dst = dst;
61 this->src = src;
62 this->sources = sources;
63
64 this->conditional_mod = BRW_CONDITIONAL_NONE;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68
69 this->writes_accumulator = false;
70 }
71
72 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
73 {
74 fs_reg *src = ralloc_array(this, fs_reg, 3);
75 init(opcode, dst, src, 0);
76 }
77
78 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
79 {
80 fs_reg *src = ralloc_array(this, fs_reg, 3);
81 src[0] = src0;
82 init(opcode, dst, src, 1);
83 }
84
85 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
86 const fs_reg &src1)
87 {
88 fs_reg *src = ralloc_array(this, fs_reg, 3);
89 src[0] = src0;
90 src[1] = src1;
91 init(opcode, dst, src, 2);
92 }
93
94 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
95 const fs_reg &src1, const fs_reg &src2)
96 {
97 fs_reg *src = ralloc_array(this, fs_reg, 3);
98 src[0] = src0;
99 src[1] = src1;
100 src[2] = src2;
101 init(opcode, dst, src, 3);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
105 {
106 init(opcode, dst, src, sources);
107 }
108
109 fs_inst::fs_inst(const fs_inst &that)
110 {
111 memcpy(this, &that, sizeof(that));
112
113 this->src = ralloc_array(this, fs_reg, that.sources);
114
115 for (int i = 0; i < that.sources; i++)
116 this->src[i] = that.src[i];
117 }
118
119 void
120 fs_inst::resize_sources(uint8_t num_sources)
121 {
122 if (this->sources != num_sources) {
123 this->src = reralloc(this, this->src, fs_reg, num_sources);
124 this->sources = num_sources;
125 }
126 }
127
128 #define ALU1(op) \
129 fs_inst * \
130 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
131 { \
132 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
133 }
134
135 #define ALU2(op) \
136 fs_inst * \
137 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
138 const fs_reg &src1) \
139 { \
140 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
141 }
142
143 #define ALU2_ACC(op) \
144 fs_inst * \
145 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
146 const fs_reg &src1) \
147 { \
148 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
149 inst->writes_accumulator = true; \
150 return inst; \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
156 const fs_reg &src1, const fs_reg &src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2_ACC(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(SEL)
188 ALU2(MAC)
189
190 /** Gen4 predicated IF. */
191 fs_inst *
192 fs_visitor::IF(enum brw_predicate predicate)
193 {
194 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
195 inst->predicate = predicate;
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 fs_inst *
201 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
206 reg_null_d, src0, src1);
207 inst->conditional_mod = condition;
208 return inst;
209 }
210
211 /**
212 * CMP: Sets the low bit of the destination channels with the result
213 * of the comparison, while the upper bits are undefined, and updates
214 * the flag register with the packed 16 bits of the result.
215 */
216 fs_inst *
217 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
218 enum brw_conditional_mod condition)
219 {
220 fs_inst *inst;
221
222 /* Take the instruction:
223 *
224 * CMP null<d> src0<f> src1<f>
225 *
226 * Original gen4 does type conversion to the destination type before
227 * comparison, producing garbage results for floating point comparisons.
228 * gen5 does the comparison on the execution type (resolved source types),
229 * so dst type doesn't matter. gen6 does comparison and then uses the
230 * result as if it was the dst type with no conversion, which happens to
231 * mostly work out for float-interpreted-as-int since our comparisons are
232 * for >0, =0, <0.
233 */
234 if (brw->gen == 4) {
235 dst.type = src0.type;
236 if (dst.file == HW_REG)
237 dst.fixed_hw_reg.type = dst.type;
238 }
239
240 resolve_ud_negate(&src0);
241 resolve_ud_negate(&src1);
242
243 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
244 inst->conditional_mod = condition;
245
246 return inst;
247 }
248
249 fs_inst *
250 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
251 {
252 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
253 sources);
254 inst->regs_written = sources;
255
256 return inst;
257 }
258
259 exec_list
260 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
261 const fs_reg &surf_index,
262 const fs_reg &varying_offset,
263 uint32_t const_offset)
264 {
265 exec_list instructions;
266 fs_inst *inst;
267
268 /* We have our constant surface use a pitch of 4 bytes, so our index can
269 * be any component of a vector, and then we load 4 contiguous
270 * components starting from that.
271 *
272 * We break down the const_offset into a portion added to the variable
273 * offset and a portion done using reg_offset, which means that if you
274 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
275 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
276 * CSE can later notice that those loads are all the same and eliminate
277 * the redundant ones.
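    *
    * For example, with const_offset == 6 the ADD below computes
    * vec4_offset = varying_offset + 4, and the final MOV picks out
    * component (6 & 3) == 2 of the loaded vec4 (times scale).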
278 */
279 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
280 instructions.push_tail(ADD(vec4_offset,
281 varying_offset, const_offset & ~3));
282
283 int scale = 1;
284 if (brw->gen == 4 && dispatch_width == 8) {
285 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
286 * u, v, r) as parameters, or we can just use the SIMD16 message
287 * consisting of (header, u). We choose the second, at the cost of a
288 * longer return length.
289 */
290 scale = 2;
291 }
292
293 enum opcode op;
294 if (brw->gen >= 7)
295 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
296 else
297 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
298 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
299 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
300 inst->regs_written = 4 * scale;
301 instructions.push_tail(inst);
302
303 if (brw->gen < 7) {
304 inst->base_mrf = 13;
305 inst->header_present = true;
306 if (brw->gen == 4)
307 inst->mlen = 3;
308 else
309 inst->mlen = 1 + dispatch_width / 8;
310 }
311
312 vec4_result.reg_offset += (const_offset & 3) * scale;
313 instructions.push_tail(MOV(dst, vec4_result));
314
315 return instructions;
316 }
317
318 /**
319 * A helper for MOV generation for fixing up broken hardware SEND dependency
320 * handling.
321 */
322 fs_inst *
323 fs_visitor::DEP_RESOLVE_MOV(int grf)
324 {
325 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
326
327 inst->ir = NULL;
328 inst->annotation = "send dependency resolve";
329
330 /* The caller always wants uncompressed to emit the minimal extra
331 * dependencies, and to avoid having to deal with aligning its regs to 2.
332 */
333 inst->force_uncompressed = true;
334
335 return inst;
336 }
337
338 bool
339 fs_inst::equals(fs_inst *inst) const
340 {
341 return (opcode == inst->opcode &&
342 dst.equals(inst->dst) &&
343 src[0].equals(inst->src[0]) &&
344 src[1].equals(inst->src[1]) &&
345 src[2].equals(inst->src[2]) &&
346 saturate == inst->saturate &&
347 predicate == inst->predicate &&
348 conditional_mod == inst->conditional_mod &&
349 mlen == inst->mlen &&
350 base_mrf == inst->base_mrf &&
351 target == inst->target &&
352 eot == inst->eot &&
353 header_present == inst->header_present &&
354 shadow_compare == inst->shadow_compare &&
355 offset == inst->offset);
356 }
357
358 bool
359 fs_inst::overwrites_reg(const fs_reg &reg) const
360 {
361 return (reg.file == dst.file &&
362 reg.reg == dst.reg &&
363 reg.reg_offset >= dst.reg_offset &&
364 reg.reg_offset < dst.reg_offset + regs_written);
365 }
366
367 bool
368 fs_inst::is_send_from_grf() const
369 {
370 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
371 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
372 opcode == FS_OPCODE_INTERPOLATE_AT_CENTROID ||
373 opcode == FS_OPCODE_INTERPOLATE_AT_SAMPLE ||
374 opcode == FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET ||
375 opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET ||
376 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
377 src[1].file == GRF) ||
378 (is_tex() && src[0].file == GRF));
379 }
380
381 bool
382 fs_inst::can_do_source_mods(struct brw_context *brw)
383 {
384 if (brw->gen == 6 && is_math())
385 return false;
386
387 if (is_send_from_grf())
388 return false;
389
390 if (!backend_instruction::can_do_source_mods())
391 return false;
392
393 return true;
394 }
395
396 void
397 fs_reg::init()
398 {
399 memset(this, 0, sizeof(*this));
400 stride = 1;
401 }
402
403 /** Generic unset register constructor. */
404 fs_reg::fs_reg()
405 {
406 init();
407 this->file = BAD_FILE;
408 }
409
410 /** Immediate value constructor. */
411 fs_reg::fs_reg(float f)
412 {
413 init();
414 this->file = IMM;
415 this->type = BRW_REGISTER_TYPE_F;
416 this->fixed_hw_reg.dw1.f = f;
417 }
418
419 /** Immediate value constructor. */
420 fs_reg::fs_reg(int32_t i)
421 {
422 init();
423 this->file = IMM;
424 this->type = BRW_REGISTER_TYPE_D;
425 this->fixed_hw_reg.dw1.d = i;
426 }
427
428 /** Immediate value constructor. */
429 fs_reg::fs_reg(uint32_t u)
430 {
431 init();
432 this->file = IMM;
433 this->type = BRW_REGISTER_TYPE_UD;
434 this->fixed_hw_reg.dw1.ud = u;
435 }
436
437 /** Fixed brw_reg. */
438 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
439 {
440 init();
441 this->file = HW_REG;
442 this->fixed_hw_reg = fixed_hw_reg;
443 this->type = fixed_hw_reg.type;
444 }
445
446 bool
447 fs_reg::equals(const fs_reg &r) const
448 {
449 return (file == r.file &&
450 reg == r.reg &&
451 reg_offset == r.reg_offset &&
452 subreg_offset == r.subreg_offset &&
453 type == r.type &&
454 negate == r.negate &&
455 abs == r.abs &&
456 !reladdr && !r.reladdr &&
457 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
458 sizeof(fixed_hw_reg)) == 0 &&
459 stride == r.stride);
460 }
461
462 fs_reg &
463 fs_reg::apply_stride(unsigned stride)
464 {
465 assert((this->stride * stride) <= 4 &&
466 (is_power_of_two(stride) || stride == 0) &&
467 file != HW_REG && file != IMM);
468 this->stride *= stride;
469 return *this;
470 }
471
472 fs_reg &
473 fs_reg::set_smear(unsigned subreg)
474 {
475 assert(file != HW_REG && file != IMM);
476 subreg_offset = subreg * type_sz(type);
477 stride = 0;
478 return *this;
479 }
480
481 bool
482 fs_reg::is_contiguous() const
483 {
484 return stride == 1;
485 }
486
487 bool
488 fs_reg::is_valid_3src() const
489 {
490 return file == GRF || file == UNIFORM;
491 }
492
493 int
494 fs_visitor::type_size(const struct glsl_type *type)
495 {
496 unsigned int size, i;
497
498 switch (type->base_type) {
499 case GLSL_TYPE_UINT:
500 case GLSL_TYPE_INT:
501 case GLSL_TYPE_FLOAT:
502 case GLSL_TYPE_BOOL:
503 return type->components();
504 case GLSL_TYPE_ARRAY:
505 return type_size(type->fields.array) * type->length;
506 case GLSL_TYPE_STRUCT:
507 size = 0;
508 for (i = 0; i < type->length; i++) {
509 size += type_size(type->fields.structure[i].type);
510 }
511 return size;
512 case GLSL_TYPE_SAMPLER:
513 /* Samplers take up no register space, since they're baked in at
514 * link time.
515 */
516 return 0;
517 case GLSL_TYPE_ATOMIC_UINT:
518 return 0;
519 case GLSL_TYPE_IMAGE:
520 case GLSL_TYPE_VOID:
521 case GLSL_TYPE_ERROR:
522 case GLSL_TYPE_INTERFACE:
523 unreachable("not reached");
524 }
525
526 return 0;
527 }
528
529 fs_reg
530 fs_visitor::get_timestamp()
531 {
532 assert(brw->gen >= 7);
533
534 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
535 BRW_ARF_TIMESTAMP,
536 0),
537 BRW_REGISTER_TYPE_UD));
538
539 fs_reg dst = fs_reg(this, glsl_type::uint_type);
540
541 fs_inst *mov = emit(MOV(dst, ts));
542 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
543 * even if it's not enabled in the dispatch.
544 */
545 mov->force_writemask_all = true;
546 mov->force_uncompressed = true;
547
548 /* The caller wants the low 32 bits of the timestamp. Since it's running
549 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
550 * which is plenty of time for our purposes. It is identical across the
551 * EUs, but since it's tracking GPU core speed it will increment at a
552 * varying rate as render P-states change.
553 *
554 * The caller could also check if render P-states have changed (or anything
555 * else that might disrupt timing) by setting smear to 2 and checking if
556 * that field is != 0.
557 */
558 dst.set_smear(0);
559
560 return dst;
561 }
562
563 void
564 fs_visitor::emit_shader_time_begin()
565 {
566 current_annotation = "shader time start";
567 shader_start_time = get_timestamp();
568 }
569
570 void
571 fs_visitor::emit_shader_time_end()
572 {
573 current_annotation = "shader time end";
574
575 enum shader_time_shader_type type, written_type, reset_type;
576 if (dispatch_width == 8) {
577 type = ST_FS8;
578 written_type = ST_FS8_WRITTEN;
579 reset_type = ST_FS8_RESET;
580 } else {
581 assert(dispatch_width == 16);
582 type = ST_FS16;
583 written_type = ST_FS16_WRITTEN;
584 reset_type = ST_FS16_RESET;
585 }
586
587 fs_reg shader_end_time = get_timestamp();
588
589 /* Check that there weren't any timestamp reset events (assuming these
590 * were the only two timestamp reads that happened).
591 */
592 fs_reg reset = shader_end_time;
593 reset.set_smear(2);
594 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
595 test->conditional_mod = BRW_CONDITIONAL_Z;
596 emit(IF(BRW_PREDICATE_NORMAL));
597
598 push_force_uncompressed();
599 fs_reg start = shader_start_time;
600 start.negate = true;
601 fs_reg diff = fs_reg(this, glsl_type::uint_type);
602 emit(ADD(diff, start, shader_end_time));
603
604 /* If there were no instructions between the two timestamp gets, the diff
605 * is 2 cycles. Remove that overhead, so I can forget about that when
606 * trying to determine the time taken for single instructions.
607 */
608 emit(ADD(diff, diff, fs_reg(-2u)));
609
610 emit_shader_time_write(type, diff);
611 emit_shader_time_write(written_type, fs_reg(1u));
612 emit(BRW_OPCODE_ELSE);
613 emit_shader_time_write(reset_type, fs_reg(1u));
614 emit(BRW_OPCODE_ENDIF);
615
616 pop_force_uncompressed();
617 }
618
619 void
620 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
621 fs_reg value)
622 {
623 int shader_time_index =
624 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
625 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
626
627 fs_reg payload;
628 if (dispatch_width == 8)
629 payload = fs_reg(this, glsl_type::uvec2_type);
630 else
631 payload = fs_reg(this, glsl_type::uint_type);
632
633 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
634 fs_reg(), payload, offset, value));
635 }
636
637 void
638 fs_visitor::vfail(const char *format, va_list va)
639 {
640 char *msg;
641
642 if (failed)
643 return;
644
645 failed = true;
646
647 msg = ralloc_vasprintf(mem_ctx, format, va);
648 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
649
650 this->fail_msg = msg;
651
652 if (INTEL_DEBUG & DEBUG_WM) {
653 fprintf(stderr, "%s", msg);
654 }
655 }
656
657 void
658 fs_visitor::fail(const char *format, ...)
659 {
660 va_list va;
661
662 va_start(va, format);
663 vfail(format, va);
664 va_end(va);
665 }
666
667 /**
668 * Mark this program as impossible to compile in SIMD16 mode.
669 *
670 * During the SIMD8 compile (which happens first), we can detect and flag
671 * things that are unsupported in SIMD16 mode, so the compiler can skip
672 * the SIMD16 compile altogether.
673 *
674 * During a SIMD16 compile (if one happens anyway), this just calls fail().
675 */
676 void
677 fs_visitor::no16(const char *format, ...)
678 {
679 va_list va;
680
681 va_start(va, format);
682
683 if (dispatch_width == 16) {
684 vfail(format, va);
685 } else {
686 simd16_unsupported = true;
687
688 if (brw->perf_debug) {
689 if (no16_msg)
690 ralloc_vasprintf_append(&no16_msg, format, va);
691 else
692 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
693 }
694 }
695
696 va_end(va);
697 }
698
699 fs_inst *
700 fs_visitor::emit(enum opcode opcode)
701 {
702 return emit(new(mem_ctx) fs_inst(opcode));
703 }
704
705 fs_inst *
706 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
707 {
708 return emit(new(mem_ctx) fs_inst(opcode, dst));
709 }
710
711 fs_inst *
712 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
713 {
714 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
715 }
716
717 fs_inst *
718 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
719 const fs_reg &src1)
720 {
721 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
722 }
723
724 fs_inst *
725 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
726 const fs_reg &src1, const fs_reg &src2)
727 {
728 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
729 }
730
731 fs_inst *
732 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
733 fs_reg src[], int sources)
734 {
735 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
736 }
737
738 void
739 fs_visitor::push_force_uncompressed()
740 {
741 force_uncompressed_stack++;
742 }
743
744 void
745 fs_visitor::pop_force_uncompressed()
746 {
747 force_uncompressed_stack--;
748 assert(force_uncompressed_stack >= 0);
749 }
750
751 /**
752 * Returns true if the instruction has a flag that means it won't
753 * update an entire destination register.
754 *
755 * For example, dead code elimination and live variable analysis want to know
756 * when a write to a variable screens off any preceding values that were in
757 * it.
758 */
759 bool
760 fs_inst::is_partial_write() const
761 {
762 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
763 this->force_uncompressed ||
764 this->force_sechalf || !this->dst.is_contiguous());
765 }
766
767 int
768 fs_inst::regs_read(fs_visitor *v, int arg) const
769 {
770 if (is_tex() && arg == 0 && src[0].file == GRF) {
771 if (v->dispatch_width == 16)
772 return (mlen + 1) / 2;
773 else
774 return mlen;
775 }
776 return 1;
777 }
778
779 bool
780 fs_inst::reads_flag() const
781 {
782 return predicate;
783 }
784
785 bool
786 fs_inst::writes_flag() const
787 {
788 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
789 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
790 }
791
792 /**
793 * Returns how many MRFs an FS opcode will write over.
794 *
795 * Note that this is not the 0 or 1 implied writes in an actual gen
796 * instruction -- the FS opcodes often generate MOVs in addition.
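  *
  * For example, a SIMD16 SHADER_OPCODE_POW writes 2 * 16 / 8 = 4 MRFs
  * for its two operands, while the sampler opcodes below always report 1.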
797 */
798 int
799 fs_visitor::implied_mrf_writes(fs_inst *inst)
800 {
801 if (inst->mlen == 0)
802 return 0;
803
804 if (inst->base_mrf == -1)
805 return 0;
806
807 switch (inst->opcode) {
808 case SHADER_OPCODE_RCP:
809 case SHADER_OPCODE_RSQ:
810 case SHADER_OPCODE_SQRT:
811 case SHADER_OPCODE_EXP2:
812 case SHADER_OPCODE_LOG2:
813 case SHADER_OPCODE_SIN:
814 case SHADER_OPCODE_COS:
815 return 1 * dispatch_width / 8;
816 case SHADER_OPCODE_POW:
817 case SHADER_OPCODE_INT_QUOTIENT:
818 case SHADER_OPCODE_INT_REMAINDER:
819 return 2 * dispatch_width / 8;
820 case SHADER_OPCODE_TEX:
821 case FS_OPCODE_TXB:
822 case SHADER_OPCODE_TXD:
823 case SHADER_OPCODE_TXF:
824 case SHADER_OPCODE_TXF_CMS:
825 case SHADER_OPCODE_TXF_MCS:
826 case SHADER_OPCODE_TG4:
827 case SHADER_OPCODE_TG4_OFFSET:
828 case SHADER_OPCODE_TXL:
829 case SHADER_OPCODE_TXS:
830 case SHADER_OPCODE_LOD:
831 return 1;
832 case FS_OPCODE_FB_WRITE:
833 return 2;
834 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
835 case SHADER_OPCODE_GEN4_SCRATCH_READ:
836 return 1;
837 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
838 return inst->mlen;
839 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
840 return 2;
841 case SHADER_OPCODE_UNTYPED_ATOMIC:
842 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
843 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
844 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
845 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
846 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
847 return 0;
848 default:
849 unreachable("not reached");
850 }
851 }
852
853 int
854 fs_visitor::virtual_grf_alloc(int size)
855 {
856 if (virtual_grf_array_size <= virtual_grf_count) {
857 if (virtual_grf_array_size == 0)
858 virtual_grf_array_size = 16;
859 else
860 virtual_grf_array_size *= 2;
861 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
862 virtual_grf_array_size);
863 }
864 virtual_grf_sizes[virtual_grf_count] = size;
865 return virtual_grf_count++;
866 }
867
868 /** Fixed HW reg constructor. */
869 fs_reg::fs_reg(enum register_file file, int reg)
870 {
871 init();
872 this->file = file;
873 this->reg = reg;
874 this->type = BRW_REGISTER_TYPE_F;
875 }
876
877 /** Fixed HW reg constructor. */
878 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
879 {
880 init();
881 this->file = file;
882 this->reg = reg;
883 this->type = type;
884 }
885
886 /** Automatic reg constructor. */
887 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
888 {
889 init();
890
891 this->file = GRF;
892 this->reg = v->virtual_grf_alloc(v->type_size(type));
893 this->reg_offset = 0;
894 this->type = brw_type_for_base_type(type);
895 }
896
897 fs_reg *
898 fs_visitor::variable_storage(ir_variable *var)
899 {
900 return (fs_reg *)hash_table_find(this->variable_ht, var);
901 }
902
903 void
904 import_uniforms_callback(const void *key,
905 void *data,
906 void *closure)
907 {
908 struct hash_table *dst_ht = (struct hash_table *)closure;
909 const fs_reg *reg = (const fs_reg *)data;
910
911 if (reg->file != UNIFORM)
912 return;
913
914 hash_table_insert(dst_ht, data, key);
915 }
916
917 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
918  * This brings in those uniform definitions.
919 */
920 void
921 fs_visitor::import_uniforms(fs_visitor *v)
922 {
923 hash_table_call_foreach(v->variable_ht,
924 import_uniforms_callback,
925 variable_ht);
926 this->push_constant_loc = v->push_constant_loc;
927 this->pull_constant_loc = v->pull_constant_loc;
928 this->uniforms = v->uniforms;
929 this->param_size = v->param_size;
930 }
931
932 /* Our support for uniforms is piggy-backed on the struct
933 * gl_fragment_program, because that's where the values actually
934 * get stored, rather than in some global gl_shader_program uniform
935 * store.
936 */
937 void
938 fs_visitor::setup_uniform_values(ir_variable *ir)
939 {
940 int namelen = strlen(ir->name);
941
942 /* The data for our (non-builtin) uniforms is stored in a series of
943 * gl_uniform_driver_storage structs for each subcomponent that
944 * glGetUniformLocation() could name. We know it's been set up in the same
945 * order we'd walk the type, so walk the list of storage and find anything
946 * with our name, or the prefix of a component that starts with our name.
947 */
948 unsigned params_before = uniforms;
949 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
950 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
951
952 if (strncmp(ir->name, storage->name, namelen) != 0 ||
953 (storage->name[namelen] != 0 &&
954 storage->name[namelen] != '.' &&
955 storage->name[namelen] != '[')) {
956 continue;
957 }
958
959 unsigned slots = storage->type->component_slots();
960 if (storage->array_elements)
961 slots *= storage->array_elements;
962
963 for (unsigned i = 0; i < slots; i++) {
964 stage_prog_data->param[uniforms++] = &storage->storage[i];
965 }
966 }
967
968 /* Make sure we actually initialized the right amount of stuff here. */
969 assert(params_before + ir->type->component_slots() == uniforms);
970 (void)params_before;
971 }
972
973
974 /* Our support for builtin uniforms is even scarier than non-builtin.
975 * It sits on top of the PROG_STATE_VAR parameters that are
976 * automatically updated from GL context state.
977 */
978 void
979 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
980 {
981 const ir_state_slot *const slots = ir->state_slots;
982 assert(ir->state_slots != NULL);
983
984 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
985 /* This state reference has already been setup by ir_to_mesa, but we'll
986 * get the same index back here.
987 */
988 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
989 (gl_state_index *)slots[i].tokens);
990
991 /* Add each of the unique swizzles of the element as a parameter.
992 * This'll end up matching the expected layout of the
993 * array/matrix/structure we're trying to fill in.
994 */
995 int last_swiz = -1;
996 for (unsigned int j = 0; j < 4; j++) {
997 int swiz = GET_SWZ(slots[i].swizzle, j);
998 if (swiz == last_swiz)
999 break;
1000 last_swiz = swiz;
1001
1002 stage_prog_data->param[uniforms++] =
1003 &fp->Base.Parameters->ParameterValues[index][swiz];
1004 }
1005 }
1006 }
1007
1008 fs_reg *
1009 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1010 {
1011 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1012 fs_reg wpos = *reg;
1013 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
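   /* flip is set for origin_lower_left without render_to_fbo, or for
    * origin_upper_left with render_to_fbo.  When flipping, the ADD below
    * computes (drawable_height - 1 + offset) - pixel_y by using the
    * negate flag on pixel_y.
    */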
1014
1015 /* gl_FragCoord.x */
1016 if (ir->data.pixel_center_integer) {
1017 emit(MOV(wpos, this->pixel_x));
1018 } else {
1019 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1020 }
1021 wpos.reg_offset++;
1022
1023 /* gl_FragCoord.y */
1024 if (!flip && ir->data.pixel_center_integer) {
1025 emit(MOV(wpos, this->pixel_y));
1026 } else {
1027 fs_reg pixel_y = this->pixel_y;
1028 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1029
1030 if (flip) {
1031 pixel_y.negate = true;
1032 offset += key->drawable_height - 1.0;
1033 }
1034
1035 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1036 }
1037 wpos.reg_offset++;
1038
1039 /* gl_FragCoord.z */
1040 if (brw->gen >= 6) {
1041 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1042 } else {
1043 emit(FS_OPCODE_LINTERP, wpos,
1044 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1045 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1046 interp_reg(VARYING_SLOT_POS, 2));
1047 }
1048 wpos.reg_offset++;
1049
1050 /* gl_FragCoord.w: Already set up in emit_interpolation */
1051 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1052
1053 return reg;
1054 }
1055
1056 fs_inst *
1057 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1058 glsl_interp_qualifier interpolation_mode,
1059 bool is_centroid, bool is_sample)
1060 {
1061 brw_wm_barycentric_interp_mode barycoord_mode;
1062 if (brw->gen >= 6) {
1063 if (is_centroid) {
1064 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1065 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1066 else
1067 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1068 } else if (is_sample) {
1069 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1070 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1071 else
1072 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1073 } else {
1074 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1075 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1076 else
1077 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1078 }
1079 } else {
1080 /* On Ironlake and below, there is only one interpolation mode.
1081 * Centroid interpolation doesn't mean anything on this hardware --
1082 * there is no multisampling.
1083 */
1084 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1085 }
1086 return emit(FS_OPCODE_LINTERP, attr,
1087 this->delta_x[barycoord_mode],
1088 this->delta_y[barycoord_mode], interp);
1089 }
1090
1091 fs_reg *
1092 fs_visitor::emit_general_interpolation(ir_variable *ir)
1093 {
1094 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1095 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1096 fs_reg attr = *reg;
1097
1098 unsigned int array_elements;
1099 const glsl_type *type;
1100
1101 if (ir->type->is_array()) {
1102 array_elements = ir->type->length;
1103 if (array_elements == 0) {
1104 fail("dereferenced array '%s' has length 0\n", ir->name);
1105 }
1106 type = ir->type->fields.array;
1107 } else {
1108 array_elements = 1;
1109 type = ir->type;
1110 }
1111
1112 glsl_interp_qualifier interpolation_mode =
1113 ir->determine_interpolation_mode(key->flat_shade);
1114
1115 int location = ir->data.location;
1116 for (unsigned int i = 0; i < array_elements; i++) {
1117 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1118 if (prog_data->urb_setup[location] == -1) {
1119 /* If there's no incoming setup data for this slot, don't
1120 * emit interpolation for it.
1121 */
1122 attr.reg_offset += type->vector_elements;
1123 location++;
1124 continue;
1125 }
1126
1127 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1128 /* Constant interpolation (flat shading) case. The SF has
1129 * handed us defined values in only the constant offset
1130 * field of the setup reg.
1131 */
1132 for (unsigned int k = 0; k < type->vector_elements; k++) {
1133 struct brw_reg interp = interp_reg(location, k);
1134 interp = suboffset(interp, 3);
1135 interp.type = reg->type;
1136 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1137 attr.reg_offset++;
1138 }
1139 } else {
1140 /* Smooth/noperspective interpolation case. */
1141 for (unsigned int k = 0; k < type->vector_elements; k++) {
1142 struct brw_reg interp = interp_reg(location, k);
1143 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1144 /* Get the pixel/sample mask into f0 so that we know
1145 * which pixels are lit. Then, for each channel that is
1146 * unlit, replace the centroid data with non-centroid
1147 * data.
1148 */
1149 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1150
1151 fs_inst *inst;
1152 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1153 false, false);
1154 inst->predicate = BRW_PREDICATE_NORMAL;
1155 inst->predicate_inverse = true;
1156 if (brw->has_pln)
1157 inst->no_dd_clear = true;
1158
1159 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1160 ir->data.centroid && !key->persample_shading,
1161 ir->data.sample || key->persample_shading);
1162 inst->predicate = BRW_PREDICATE_NORMAL;
1163 inst->predicate_inverse = false;
1164 if (brw->has_pln)
1165 inst->no_dd_check = true;
1166
1167 } else {
1168 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1169 ir->data.centroid && !key->persample_shading,
1170 ir->data.sample || key->persample_shading);
1171 }
1172 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1173 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1174 }
1175 attr.reg_offset++;
1176 }
1177
1178 }
1179 location++;
1180 }
1181 }
1182
1183 return reg;
1184 }
1185
1186 fs_reg *
1187 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1188 {
1189 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1190
1191 /* The frontfacing comes in as a bit in the thread payload. */
1192 if (brw->gen >= 6) {
1193 emit(BRW_OPCODE_ASR, *reg,
1194 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1195 fs_reg(15));
1196 emit(BRW_OPCODE_NOT, *reg, *reg);
1197 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1198 } else {
1199 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1200 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1201 * us front face
1202 */
1203 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1204 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1205 }
1206
1207 return reg;
1208 }
1209
1210 void
1211 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1212 {
1213 assert(dst.type == BRW_REGISTER_TYPE_F);
1214
1215 if (key->compute_pos_offset) {
1216 /* Convert int_sample_pos to floating point */
1217 emit(MOV(dst, int_sample_pos));
1218 /* Scale to the range [0, 1] */
1219 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1220 }
1221 else {
1222 /* From ARB_sample_shading specification:
1223 * "When rendering to a non-multisample buffer, or if multisample
1224 * rasterization is disabled, gl_SamplePosition will always be
1225 * (0.5, 0.5)."
1226 */
1227 emit(MOV(dst, fs_reg(0.5f)));
1228 }
1229 }
1230
1231 fs_reg *
1232 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1233 {
1234 assert(brw->gen >= 6);
1235 assert(ir->type == glsl_type::vec2_type);
1236
1237 this->current_annotation = "compute sample position";
1238 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1239 fs_reg pos = *reg;
1240 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1241 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1242
1243 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1244 * mode will be enabled.
1245 *
1246 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1247 * R31.1:0 Position Offset X/Y for Slot[3:0]
1248 * R31.3:2 Position Offset X/Y for Slot[7:4]
1249 * .....
1250 *
1251 * The X, Y sample positions come in as bytes in thread payload. So, read
1252 * the positions using vstride=16, width=8, hstride=2.
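       *
       * The byte values are sample offsets in 1/16-pixel units, which is
       * why compute_sample_position() scales the result by 1/16 to get a
       * position in [0, 1] (e.g. a payload value of 8 maps to 0.5).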
1253 */
1254 struct brw_reg sample_pos_reg =
1255 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1256 BRW_REGISTER_TYPE_B), 16, 8, 2);
1257
1258 fs_inst *inst = emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1259 if (dispatch_width == 16) {
1260 inst->force_uncompressed = true;
1261 inst = emit(MOV(half(int_sample_x, 1),
1262 fs_reg(suboffset(sample_pos_reg, 16))));
1263 inst->force_sechalf = true;
1264 }
1265 /* Compute gl_SamplePosition.x */
1266 compute_sample_position(pos, int_sample_x);
1267 pos.reg_offset++;
1268 inst = emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1269 if (dispatch_width == 16) {
1270 inst->force_uncompressed = true;
1271 inst = emit(MOV(half(int_sample_y, 1),
1272 fs_reg(suboffset(sample_pos_reg, 17))));
1273 inst->force_sechalf = true;
1274 }
1275 /* Compute gl_SamplePosition.y */
1276 compute_sample_position(pos, int_sample_y);
1277 return reg;
1278 }
1279
1280 fs_reg *
1281 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1282 {
1283 assert(brw->gen >= 6);
1284
1285 this->current_annotation = "compute sample id";
1286 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1287
1288 if (key->compute_sample_id) {
1289 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1290 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1291 t2.type = BRW_REGISTER_TYPE_UW;
1292
1293 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1294 * 8x multisampling, subspan 0 will represent sample N (where N
1295 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1296 * 7. We can find the value of N by looking at R0.0 bits 7:6
1297 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1298 * (since samples are always delivered in pairs). That is, we
1299 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1300 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1301 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1302 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1303 * populating a temporary variable with the sequence (0, 1, 2, 3),
1304 * and then reading from it using vstride=1, width=4, hstride=0.
1305 * These computations hold good for 4x multisampling as well.
1306 *
1307 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1308 * the first four slots are sample 0 of subspan 0; the next four
1309 * are sample 1 of subspan 0; the third group is sample 0 of
1310 * subspan 1, and finally sample 1 of subspan 1.
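       *
       * For example, with 8x MSAA and R0.0 bits 7:6 == 0b10, t1 below
       * becomes (0x80 >> 5) == 4, so the SIMD8 ADD produces sample IDs
       * 4, 4, 4, 4, 5, 5, 5, 5 across the eight channels.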
1311 */
1312 fs_inst *inst;
1313 inst = emit(BRW_OPCODE_AND, t1,
1314 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1315 fs_reg(0xc0));
1316 inst->force_writemask_all = true;
1317 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1318 inst->force_writemask_all = true;
1319 /* This works for both SIMD8 and SIMD16 */
1320 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1321 inst->force_writemask_all = true;
1322 /* This special instruction takes care of setting vstride=1,
1323 * width=4, hstride=0 of t2 during an ADD instruction.
1324 */
1325 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1326 } else {
1327 /* As per GL_ARB_sample_shading specification:
1328 * "When rendering to a non-multisample buffer, or if multisample
1329 * rasterization is disabled, gl_SampleID will always be zero."
1330 */
1331 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1332 }
1333
1334 return reg;
1335 }
1336
1337 fs_reg
1338 fs_visitor::fix_math_operand(fs_reg src)
1339 {
1340 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1341 * might be able to do better by doing execsize = 1 math and then
1342 * expanding that result out, but we would need to be careful with
1343 * masking.
1344 *
1345 * The hardware ignores source modifiers (negate and abs) on math
1346 * instructions, so we also move to a temp to set those up.
1347 */
1348 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1349 !src.abs && !src.negate)
1350 return src;
1351
1352 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1353 * operands to math
1354 */
1355 if (brw->gen >= 7 && src.file != IMM)
1356 return src;
1357
1358 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1359 expanded.type = src.type;
1360 emit(BRW_OPCODE_MOV, expanded, src);
1361 return expanded;
1362 }
1363
1364 fs_inst *
1365 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1366 {
1367 switch (opcode) {
1368 case SHADER_OPCODE_RCP:
1369 case SHADER_OPCODE_RSQ:
1370 case SHADER_OPCODE_SQRT:
1371 case SHADER_OPCODE_EXP2:
1372 case SHADER_OPCODE_LOG2:
1373 case SHADER_OPCODE_SIN:
1374 case SHADER_OPCODE_COS:
1375 break;
1376 default:
1377 unreachable("not reached: bad math opcode");
1378 }
1379
1380 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1381 * might be able to do better by doing execsize = 1 math and then
1382 * expanding that result out, but we would need to be careful with
1383 * masking.
1384 *
1385 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1386 * instructions, so we also move to a temp to set those up.
1387 */
1388 if (brw->gen == 6 || brw->gen == 7)
1389 src = fix_math_operand(src);
1390
1391 fs_inst *inst = emit(opcode, dst, src);
1392
1393 if (brw->gen < 6) {
1394 inst->base_mrf = 2;
1395 inst->mlen = dispatch_width / 8;
1396 }
1397
1398 return inst;
1399 }
1400
1401 fs_inst *
1402 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1403 {
1404 int base_mrf = 2;
1405 fs_inst *inst;
1406
1407 if (brw->gen >= 8) {
1408 inst = emit(opcode, dst, src0, src1);
1409 } else if (brw->gen >= 6) {
1410 src0 = fix_math_operand(src0);
1411 src1 = fix_math_operand(src1);
1412
1413 inst = emit(opcode, dst, src0, src1);
1414 } else {
1415 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1416 * "Message Payload":
1417 *
1418 * "Operand0[7]. For the INT DIV functions, this operand is the
1419 * denominator."
1420 * ...
1421 * "Operand1[7]. For the INT DIV functions, this operand is the
1422 * numerator."
1423 */
1424 bool is_int_div = opcode != SHADER_OPCODE_POW;
1425 fs_reg &op0 = is_int_div ? src1 : src0;
1426 fs_reg &op1 = is_int_div ? src0 : src1;
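      /* i.e. for INT_QUOTIENT/INT_REMAINDER the denominator (src1) becomes
       * the instruction's src0 (the first message operand) and the
       * numerator (src0) is moved into the second MRF below, matching the
       * payload layout quoted above.
       */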
1427
1428 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1429 inst = emit(opcode, dst, op0, reg_null_f);
1430
1431 inst->base_mrf = base_mrf;
1432 inst->mlen = 2 * dispatch_width / 8;
1433 }
1434 return inst;
1435 }
1436
1437 void
1438 fs_visitor::assign_curb_setup()
1439 {
1440 if (dispatch_width == 8) {
1441 prog_data->base.dispatch_grf_start_reg = payload.num_regs;
1442 } else {
1443 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1444 }
1445
1446 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1447
1448 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1449 foreach_in_list(fs_inst, inst, &instructions) {
1450 for (unsigned int i = 0; i < inst->sources; i++) {
1451 if (inst->src[i].file == UNIFORM) {
1452 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1453 int constant_nr;
1454 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1455 constant_nr = push_constant_loc[uniform_nr];
1456 } else {
1457 /* Section 5.11 of the OpenGL 4.1 spec says:
1458 * "Out-of-bounds reads return undefined values, which include
1459 * values from other variables of the active program or zero."
1460 * Just return the first push constant.
1461 */
1462 constant_nr = 0;
1463 }
1464
1465 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1466 constant_nr / 8,
1467 constant_nr % 8);
1468
1469 inst->src[i].file = HW_REG;
1470 inst->src[i].fixed_hw_reg = byte_offset(
1471 retype(brw_reg, inst->src[i].type),
1472 inst->src[i].subreg_offset);
1473 }
1474 }
1475 }
1476 }
1477
1478 void
1479 fs_visitor::calculate_urb_setup()
1480 {
1481 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1482 prog_data->urb_setup[i] = -1;
1483 }
1484
1485 int urb_next = 0;
1486 /* Figure out where each of the incoming setup attributes lands. */
1487 if (brw->gen >= 6) {
1488 if (_mesa_bitcount_64(fp->Base.InputsRead &
1489 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1490 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1491 * first 16 varying inputs, so we can put them wherever we want.
1492 * Just put them in order.
1493 *
1494 * This is useful because it means that (a) inputs not used by the
1495 * fragment shader won't take up valuable register space, and (b) we
1496 * won't have to recompile the fragment shader if it gets paired with
1497 * a different vertex (or geometry) shader.
1498 */
1499 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1500 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1501 BITFIELD64_BIT(i)) {
1502 prog_data->urb_setup[i] = urb_next++;
1503 }
1504 }
1505 } else {
1506 /* We have enough input varyings that the SF/SBE pipeline stage can't
1507 * arbitrarily rearrange them to suit our whim; we have to put them
1508 * in an order that matches the output of the previous pipeline stage
1509 * (geometry or vertex shader).
1510 */
1511 struct brw_vue_map prev_stage_vue_map;
1512 brw_compute_vue_map(brw, &prev_stage_vue_map,
1513 key->input_slots_valid);
1514 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1515 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1516 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1517 slot++) {
1518 int varying = prev_stage_vue_map.slot_to_varying[slot];
1519 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1520 * unused.
1521 */
1522 if (varying != BRW_VARYING_SLOT_COUNT &&
1523 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1524 BITFIELD64_BIT(varying))) {
1525 prog_data->urb_setup[varying] = slot - first_slot;
1526 }
1527 }
1528 urb_next = prev_stage_vue_map.num_slots - first_slot;
1529 }
1530 } else {
1531 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1532 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1533 /* Point size is packed into the header, not as a general attribute */
1534 if (i == VARYING_SLOT_PSIZ)
1535 continue;
1536
1537 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1538 /* The back color slot is skipped when the front color is
1539 * also written to. In addition, some slots can be
1540 * written in the vertex shader and not read in the
1541 * fragment shader. So the register number must always be
1542 * incremented, mapped or not.
1543 */
1544 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1545 prog_data->urb_setup[i] = urb_next;
1546 urb_next++;
1547 }
1548 }
1549
1550 /*
1551 * It's an FS-only attribute, and we did interpolation for this attribute
1552 * in the SF thread. So, count it here, too.
1553 *
1554 * See compile_sf_prog() for more info.
1555 */
1556 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1557 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1558 }
1559
1560 prog_data->num_varying_inputs = urb_next;
1561 }
1562
1563 void
1564 fs_visitor::assign_urb_setup()
1565 {
1566 int urb_start = payload.num_regs + prog_data->curb_read_length;
1567
1568 /* Offset all the urb_setup[] index by the actual position of the
1569 * setup regs, now that the location of the constants has been chosen.
1570 */
1571 foreach_in_list(fs_inst, inst, &instructions) {
1572 if (inst->opcode == FS_OPCODE_LINTERP) {
1573 assert(inst->src[2].file == HW_REG);
1574 inst->src[2].fixed_hw_reg.nr += urb_start;
1575 }
1576
1577 if (inst->opcode == FS_OPCODE_CINTERP) {
1578 assert(inst->src[0].file == HW_REG);
1579 inst->src[0].fixed_hw_reg.nr += urb_start;
1580 }
1581 }
1582
1583 /* Each attribute is 4 setup channels, each of which is half a reg. */
1584 this->first_non_payload_grf =
1585 urb_start + prog_data->num_varying_inputs * 2;
1586 }
1587
1588 /**
1589 * Split large virtual GRFs into separate components if we can.
1590 *
1591 * This is mostly duplicated with what brw_fs_vector_splitting does,
1592 * but that's really conservative because it's afraid of doing
1593 * splitting that doesn't result in real progress after the rest of
1594 * the optimization phases, which would cause infinite looping in
1595 * optimization. We can do it once here, safely. This also has the
1596 * opportunity to split interpolated values, or maybe even uniforms,
1597 * which we don't have at the IR level.
1598 *
1599 * We want to split, because virtual GRFs are what we register
1600 * allocate and spill (due to contiguousness requirements for some
1601 * instructions), and they're what we naturally generate in the
1602 * codegen process, but most virtual GRFs don't actually need to be
1603 * contiguous sets of GRFs. If we split, we'll end up with reduced
1604 * live intervals and better dead code elimination and coalescing.
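  *
  * For example, a size-4 VGRF holding a vec4 temporary is split into four
  * independent size-1 VGRFs, unless an instruction writes it as a
  * contiguous multi-register destination or a send-from-GRF reads it
  * (see the checks below).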
1605 */
1606 void
1607 fs_visitor::split_virtual_grfs()
1608 {
1609 int num_vars = this->virtual_grf_count;
1610 bool split_grf[num_vars];
1611 int new_virtual_grf[num_vars];
1612
1613 /* Try to split anything > 0 sized. */
1614 for (int i = 0; i < num_vars; i++) {
1615 if (this->virtual_grf_sizes[i] != 1)
1616 split_grf[i] = true;
1617 else
1618 split_grf[i] = false;
1619 }
1620
1621 if (brw->has_pln &&
1622 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1623 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1624 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1625 * Gen6, that was the only supported interpolation mode, and since Gen6,
1626 * delta_x and delta_y are in fixed hardware registers.
1627 */
1628 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1629 false;
1630 }
1631
1632 foreach_in_list(fs_inst, inst, &instructions) {
1633 /* If there's a SEND message that requires contiguous destination
1634 * registers, no splitting is allowed.
1635 */
1636 if (inst->regs_written > 1) {
1637 split_grf[inst->dst.reg] = false;
1638 }
1639
1640 /* If we're sending from a GRF, don't split it, on the assumption that
1641 * the send is reading the whole thing.
1642 */
1643 if (inst->is_send_from_grf()) {
1644 for (int i = 0; i < inst->sources; i++) {
1645 if (inst->src[i].file == GRF) {
1646 split_grf[inst->src[i].reg] = false;
1647 }
1648 }
1649 }
1650 }
1651
1652 /* Allocate new space for split regs. Note that the virtual
1653 * numbers will be contiguous.
1654 */
1655 for (int i = 0; i < num_vars; i++) {
1656 if (split_grf[i]) {
1657 new_virtual_grf[i] = virtual_grf_alloc(1);
1658 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1659 int reg = virtual_grf_alloc(1);
1660 assert(reg == new_virtual_grf[i] + j - 1);
1661 (void) reg;
1662 }
1663 this->virtual_grf_sizes[i] = 1;
1664 }
1665 }
1666
1667 foreach_in_list(fs_inst, inst, &instructions) {
1668 if (inst->dst.file == GRF &&
1669 split_grf[inst->dst.reg] &&
1670 inst->dst.reg_offset != 0) {
1671 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1672 inst->dst.reg_offset - 1);
1673 inst->dst.reg_offset = 0;
1674 }
1675 for (int i = 0; i < inst->sources; i++) {
1676 if (inst->src[i].file == GRF &&
1677 split_grf[inst->src[i].reg] &&
1678 inst->src[i].reg_offset != 0) {
1679 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1680 inst->src[i].reg_offset - 1);
1681 inst->src[i].reg_offset = 0;
1682 }
1683 }
1684 }
1685 invalidate_live_intervals();
1686 }
1687
1688 /**
1689 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1690 *
1691 * During code generation, we create tons of temporary variables, many of
1692 * which get immediately killed and are never used again. Yet, in later
1693 * optimization and analysis passes, such as compute_live_intervals, we need
1694 * to loop over all the virtual GRFs. Compacting them can save a lot of
1695 * overhead.
1696 */
1697 void
1698 fs_visitor::compact_virtual_grfs()
1699 {
1700 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1701 return;
1702
1703 /* Mark which virtual GRFs are used, and count how many. */
1704 int remap_table[this->virtual_grf_count];
1705 memset(remap_table, -1, sizeof(remap_table));
1706
1707 foreach_in_list(const fs_inst, inst, &instructions) {
1708 if (inst->dst.file == GRF)
1709 remap_table[inst->dst.reg] = 0;
1710
1711 for (int i = 0; i < inst->sources; i++) {
1712 if (inst->src[i].file == GRF)
1713 remap_table[inst->src[i].reg] = 0;
1714 }
1715 }
1716
1717 /* Compact the GRF arrays. */
1718 int new_index = 0;
1719 for (int i = 0; i < this->virtual_grf_count; i++) {
1720 if (remap_table[i] != -1) {
1721 remap_table[i] = new_index;
1722 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1723 invalidate_live_intervals();
1724 ++new_index;
1725 }
1726 }
1727
1728 this->virtual_grf_count = new_index;
1729
1730 /* Patch all the instructions to use the newly renumbered registers */
1731 foreach_in_list(fs_inst, inst, &instructions) {
1732 if (inst->dst.file == GRF)
1733 inst->dst.reg = remap_table[inst->dst.reg];
1734
1735 for (int i = 0; i < inst->sources; i++) {
1736 if (inst->src[i].file == GRF)
1737 inst->src[i].reg = remap_table[inst->src[i].reg];
1738 }
1739 }
1740
1741 /* Patch all the references to delta_x/delta_y, since they're used in
1742 * register allocation.
1743 */
1744 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
1745 if (delta_x[i].file == GRF && remap_table[delta_x[i].reg] != -1) {
1746 delta_x[i].reg = remap_table[delta_x[i].reg];
1747 }
1748 }
1749 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
1750 if (delta_y[i].file == GRF && remap_table[delta_y[i].reg] != -1) {
1751 delta_y[i].reg = remap_table[delta_y[i].reg];
1752 }
1753 }
1754 }
1755
1756 /*
1757 * Implements array access of uniforms by inserting a
1758 * PULL_CONSTANT_LOAD instruction.
1759 *
1760 * Unlike temporary GRF array access (where we don't support it due to
1761 * the difficulty of doing relative addressing on instruction
1762 * destinations), we could potentially do array access of uniforms
1763 * that were loaded in GRF space as push constants. In real-world
1764 * usage we've seen, though, the arrays being used are always larger
1765 * than we could load as push constants, so just always move all
1766 * uniform array access out to a pull constant buffer.
1767 */
1768 void
1769 fs_visitor::move_uniform_array_access_to_pull_constants()
1770 {
1771 if (dispatch_width != 8)
1772 return;
1773
1774 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1775
1776 for (unsigned int i = 0; i < uniforms; i++) {
1777 pull_constant_loc[i] = -1;
1778 }
1779
1780 /* Walk through and find array access of uniforms. Put a copy of that
1781 * uniform in the pull constant buffer.
1782 *
1783 * Note that we don't move constant-indexed accesses to arrays. No
1784 * testing has been done of the performance impact of this choice.
1785 */
1786 foreach_in_list_safe(fs_inst, inst, &instructions) {
1787 for (int i = 0 ; i < inst->sources; i++) {
1788 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1789 continue;
1790
1791 int uniform = inst->src[i].reg;
1792
1793 /* If this array isn't already present in the pull constant buffer,
1794 * add it.
1795 */
1796 if (pull_constant_loc[uniform] == -1) {
1797 const gl_constant_value **values = &stage_prog_data->param[uniform];
1798
1799 assert(param_size[uniform]);
1800
1801 for (int j = 0; j < param_size[uniform]; j++) {
1802 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1803
1804 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1805 values[j];
1806 }
1807 }
1808 }
1809 }
1810 }
1811
1812 /**
1813 * Assign UNIFORM file registers to either push constants or pull constants.
1814 *
1815 * We allow a fragment shader to use more than the specified minimum
1816 * value of the maximum number of fragment shader uniform components (64).
1817 * If there are too many, they'd fill up all of the register space.
1818 * So, this will push some of them out to the pull constant buffer and
1819 * update the program to load them.
1820 */
1821 void
1822 fs_visitor::assign_constant_locations()
1823 {
1824 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1825 if (dispatch_width != 8)
1826 return;
1827
1828 /* Find which UNIFORM registers are still in use. */
1829 bool is_live[uniforms];
1830 for (unsigned int i = 0; i < uniforms; i++) {
1831 is_live[i] = false;
1832 }
1833
1834 foreach_in_list(fs_inst, inst, &instructions) {
1835 for (int i = 0; i < inst->sources; i++) {
1836 if (inst->src[i].file != UNIFORM)
1837 continue;
1838
1839 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1840 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1841 is_live[constant_nr] = true;
1842 }
1843 }
1844
1845 /* Only allow 16 registers (128 uniform components) as push constants.
1846 *
1847 * Just demote the end of the list. We could probably do better
1848 * here, demoting things that are rarely used in the program first.
1849 *
1850 * If changing this value, note the limitation about total_regs in
1851 * brw_curbe.c.
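    *
    * For example, a program with 200 live uniform components keeps the
    * first 128 of them as push constants and demotes the remaining 72 to
    * the pull constant buffer below.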
1852 */
1853 unsigned int max_push_components = 16 * 8;
1854 unsigned int num_push_constants = 0;
1855
1856 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1857
1858 for (unsigned int i = 0; i < uniforms; i++) {
1859 if (!is_live[i] || pull_constant_loc[i] != -1) {
1860 /* This UNIFORM register is either dead, or has already been demoted
1861 * to a pull const. Mark it as no longer living in the param[] array.
1862 */
1863 push_constant_loc[i] = -1;
1864 continue;
1865 }
1866
1867 if (num_push_constants < max_push_components) {
1868 /* Retain as a push constant. Record the location in the param[]
1869 * array.
1870 */
1871 push_constant_loc[i] = num_push_constants++;
1872 } else {
1873 /* Demote to a pull constant. */
1874 push_constant_loc[i] = -1;
1875
1876 int pull_index = stage_prog_data->nr_pull_params++;
1877 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1878 pull_constant_loc[i] = pull_index;
1879 }
1880 }
1881
1882 stage_prog_data->nr_params = num_push_constants;
1883
1884 /* Up until now, the param[] array has been indexed by reg + reg_offset
1885 * of UNIFORM registers. Condense it to only contain the uniforms we
1886 * chose to upload as push constants.
1887 */
1888 for (unsigned int i = 0; i < uniforms; i++) {
1889 int remapped = push_constant_loc[i];
1890
1891 if (remapped == -1)
1892 continue;
1893
1894 assert(remapped <= (int)i);
1895 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1896 }
1897 }
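
/* Example of the split performed above (illustrative numbers): with
 * max_push_components == 128, a shader with 150 live uniform components and
 * nothing already demoted keeps components 0..127 as push constants
 * (push_constant_loc[i] == 0..127), while components 128..149 get
 * push_constant_loc[i] == -1 and are appended to pull_param[], with
 * pull_constant_loc[i] recording their pull slots. nr_params ends up as 128
 * and param[] is compacted so that param[push_constant_loc[i]] holds what
 * param[i] used to.
 */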
1898
1899 /**
1900 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1901 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1902 */
1903 void
1904 fs_visitor::demote_pull_constants()
1905 {
1906 foreach_in_list(fs_inst, inst, &instructions) {
1907 for (int i = 0; i < inst->sources; i++) {
1908 if (inst->src[i].file != UNIFORM)
1909 continue;
1910
1911 int pull_index = pull_constant_loc[inst->src[i].reg +
1912 inst->src[i].reg_offset];
1913 if (pull_index == -1)
1914 continue;
1915
1916 /* Set up the annotation tracking for newly generated instructions. */
1917 base_ir = inst->ir;
1918 current_annotation = inst->annotation;
1919
1920 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1921 fs_reg dst = fs_reg(this, glsl_type::float_type);
1922
1923 /* Generate a pull load into dst. */
1924 if (inst->src[i].reladdr) {
1925 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1926 surf_index,
1927 *inst->src[i].reladdr,
1928 pull_index);
1929 inst->insert_before(&list);
1930 inst->src[i].reladdr = NULL;
1931 } else {
1932 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1933 fs_inst *pull =
1934 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1935 dst, surf_index, offset);
1936 inst->insert_before(pull);
1937 inst->src[i].set_smear(pull_index & 3);
1938 }
1939
1940 /* Rewrite the instruction to use the temporary VGRF. */
1941 inst->src[i].file = GRF;
1942 inst->src[i].reg = dst.reg;
1943 inst->src[i].reg_offset = 0;
1944 }
1945 }
1946 invalidate_live_intervals();
1947 }
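
/* Worked example for the constant-indexed (no reladdr) path above: with
 * pull_index == 6, the byte offset is (6 * 4) & ~15 == 16, so the
 * UNIFORM_PULL_CONSTANT_LOAD fetches the vec4-aligned block containing
 * elements 4..7, and set_smear(6 & 3) == set_smear(2) then reads component 2
 * of that block, i.e. element 6. The source is rewritten to point at the new
 * float temporary.
 */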
1948
1949 bool
1950 fs_visitor::opt_algebraic()
1951 {
1952 bool progress = false;
1953
1954 foreach_in_list(fs_inst, inst, &instructions) {
1955 switch (inst->opcode) {
1956 case BRW_OPCODE_MUL:
1957 if (inst->src[1].file != IMM)
1958 continue;
1959
1960 /* a * 1.0 = a */
1961 if (inst->src[1].is_one()) {
1962 inst->opcode = BRW_OPCODE_MOV;
1963 inst->src[1] = reg_undef;
1964 progress = true;
1965 break;
1966 }
1967
1968 /* a * 0.0 = 0.0 */
1969 if (inst->src[1].is_zero()) {
1970 inst->opcode = BRW_OPCODE_MOV;
1971 inst->src[0] = inst->src[1];
1972 inst->src[1] = reg_undef;
1973 progress = true;
1974 break;
1975 }
1976
1977 break;
1978 case BRW_OPCODE_ADD:
1979 if (inst->src[1].file != IMM)
1980 continue;
1981
1982 /* a + 0.0 = a */
1983 if (inst->src[1].is_zero()) {
1984 inst->opcode = BRW_OPCODE_MOV;
1985 inst->src[1] = reg_undef;
1986 progress = true;
1987 break;
1988 }
1989 break;
1990 case BRW_OPCODE_OR:
1991 if (inst->src[0].equals(inst->src[1])) {
1992 inst->opcode = BRW_OPCODE_MOV;
1993 inst->src[1] = reg_undef;
1994 progress = true;
1995 break;
1996 }
1997 break;
1998 case BRW_OPCODE_LRP:
1999 if (inst->src[1].equals(inst->src[2])) {
2000 inst->opcode = BRW_OPCODE_MOV;
2001 inst->src[0] = inst->src[1];
2002 inst->src[1] = reg_undef;
2003 inst->src[2] = reg_undef;
2004 progress = true;
2005 break;
2006 }
2007 break;
2008 case BRW_OPCODE_SEL:
2009 if (inst->src[0].equals(inst->src[1])) {
2010 inst->opcode = BRW_OPCODE_MOV;
2011 inst->src[1] = reg_undef;
2012 inst->predicate = BRW_PREDICATE_NONE;
2013 inst->predicate_inverse = false;
2014 progress = true;
2015 } else if (inst->saturate && inst->src[1].file == IMM) {
2016 switch (inst->conditional_mod) {
2017 case BRW_CONDITIONAL_LE:
2018 case BRW_CONDITIONAL_L:
2019 switch (inst->src[1].type) {
2020 case BRW_REGISTER_TYPE_F:
2021 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2022 inst->opcode = BRW_OPCODE_MOV;
2023 inst->src[1] = reg_undef;
2024 progress = true;
2025 }
2026 break;
2027 default:
2028 break;
2029 }
2030 break;
2031 case BRW_CONDITIONAL_GE:
2032 case BRW_CONDITIONAL_G:
2033 switch (inst->src[1].type) {
2034 case BRW_REGISTER_TYPE_F:
2035 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2036 inst->opcode = BRW_OPCODE_MOV;
2037 inst->src[1] = reg_undef;
2038 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2039 progress = true;
2040 }
2041 break;
2042 default:
2043 break;
2044 }
break;
2045 default:
2046 break;
2047 }
2048 }
2049 break;
2050 default:
2051 break;
2052 }
2053 }
2054
2055 return progress;
2056 }
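
/* A few of the rewrites performed above, in shorthand (illustrative):
 *
 *    mul dst, a, 1.0f        -> mov dst, a
 *    mul dst, a, 0.0f        -> mov dst, 0.0f
 *    add dst, a, 0.0f        -> mov dst, a
 *    or  dst, a, a           -> mov dst, a
 *    sel.l.sat dst, a, 2.0f  -> mov.sat dst, a
 *
 * The last one holds because taking the minimum against an immediate >= 1.0
 * is a no-op once the result is saturated to [0, 1]; the symmetric case
 * handles maxima against immediates <= 0.0.
 */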
2057
2058 bool
2059 fs_visitor::opt_register_renaming()
2060 {
2061 bool progress = false;
2062 int depth = 0;
2063
2064 int remap[virtual_grf_count];
2065 memset(remap, -1, sizeof(int) * virtual_grf_count);
2066
2067 foreach_in_list(fs_inst, inst, &this->instructions) {
2068 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2069 depth++;
2070 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2071 inst->opcode == BRW_OPCODE_WHILE) {
2072 depth--;
2073 }
2074
2075 /* Rewrite instruction sources. */
2076 for (int i = 0; i < inst->sources; i++) {
2077 if (inst->src[i].file == GRF &&
2078 remap[inst->src[i].reg] != -1 &&
2079 remap[inst->src[i].reg] != inst->src[i].reg) {
2080 inst->src[i].reg = remap[inst->src[i].reg];
2081 progress = true;
2082 }
2083 }
2084
2085 const int dst = inst->dst.reg;
2086
2087 if (depth == 0 &&
2088 inst->dst.file == GRF &&
2089 virtual_grf_sizes[inst->dst.reg] == 1 &&
2090 !inst->is_partial_write()) {
2091 if (remap[dst] == -1) {
2092 remap[dst] = dst;
2093 } else {
2094 remap[dst] = virtual_grf_alloc(1);
2095 inst->dst.reg = remap[dst];
2096 progress = true;
2097 }
2098 } else if (inst->dst.file == GRF &&
2099 remap[dst] != -1 &&
2100 remap[dst] != dst) {
2101 inst->dst.reg = remap[dst];
2102 progress = true;
2103 }
2104 }
2105
2106 if (progress) {
2107 invalidate_live_intervals();
2108
2109 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2110 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2111 delta_x[i].reg = remap[delta_x[i].reg];
2112 }
2113 }
2114 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2115 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2116 delta_y[i].reg = remap[delta_y[i].reg];
2117 }
2118 }
2119 }
2120
2121 return progress;
2122 }
2123
2124 bool
2125 fs_visitor::compute_to_mrf()
2126 {
2127 bool progress = false;
2128 int next_ip = 0;
2129
2130 calculate_live_intervals();
2131
2132 foreach_in_list_safe(fs_inst, inst, &instructions) {
2133 int ip = next_ip;
2134 next_ip++;
2135
2136 if (inst->opcode != BRW_OPCODE_MOV ||
2137 inst->is_partial_write() ||
2138 inst->dst.file != MRF || inst->src[0].file != GRF ||
2139 inst->dst.type != inst->src[0].type ||
2140 inst->src[0].abs || inst->src[0].negate ||
2141 !inst->src[0].is_contiguous() ||
2142 inst->src[0].subreg_offset)
2143 continue;
2144
2145 /* Work out which hardware MRF registers are written by this
2146 * instruction.
2147 */
2148 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2149 int mrf_high;
2150 if (inst->dst.reg & BRW_MRF_COMPR4) {
2151 mrf_high = mrf_low + 4;
2152 } else if (dispatch_width == 16 &&
2153 (!inst->force_uncompressed && !inst->force_sechalf)) {
2154 mrf_high = mrf_low + 1;
2155 } else {
2156 mrf_high = mrf_low;
2157 }
2158
2159 /* Can't compute-to-MRF this GRF if someone else was going to
2160 * read it later.
2161 */
2162 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2163 continue;
2164
2165 /* Found a move of a GRF to an MRF. Let's see if we can
2166 * rewrite the instruction that generated this GRF to write into the MRF instead.
2167 */
2168 fs_inst *scan_inst;
2169 for (scan_inst = (fs_inst *)inst->prev;
2170 !scan_inst->is_head_sentinel();
2171 scan_inst = (fs_inst *)scan_inst->prev) {
2172 if (scan_inst->dst.file == GRF &&
2173 scan_inst->dst.reg == inst->src[0].reg) {
2174 /* Found the last instruction to write the register we want to turn
2175 * into a compute-to-MRF.
2176 */
2177
2178 /* If this one instruction didn't populate all the
2179 * channels, bail. We might be able to rewrite everything
2180 * that writes that reg, but it would require smarter
2181 * tracking to delay the rewriting until complete success.
2182 */
2183 if (scan_inst->is_partial_write())
2184 break;
2185
2186 /* Instructions that write more than one register would need us to
2187 * understand coalescing out more than one MOV at a time.
2188 */
2189 if (scan_inst->regs_written > 1)
2190 break;
2191
2192 /* SEND instructions can't have MRF as a destination. */
2193 if (scan_inst->mlen)
2194 break;
2195
2196 if (brw->gen == 6) {
2197 /* gen6 math instructions must have the destination be
2198 * GRF, so no compute-to-MRF for them.
2199 */
2200 if (scan_inst->is_math()) {
2201 break;
2202 }
2203 }
2204
2205 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2206 /* Found the creator of our MRF's source value. */
2207 scan_inst->dst.file = MRF;
2208 scan_inst->dst.reg = inst->dst.reg;
2209 scan_inst->saturate |= inst->saturate;
2210 inst->remove();
2211 progress = true;
2212 }
2213 break;
2214 }
2215
2216 /* We don't handle control flow here. Most computation of
2217 * values that end up in MRFs happens shortly before the MRF
2218 * write anyway.
2219 */
2220 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2221 break;
2222
2223 /* You can't read from an MRF, so if someone else reads our
2224 * MRF's source GRF that we wanted to rewrite, that stops us.
2225 */
2226 bool interfered = false;
2227 for (int i = 0; i < scan_inst->sources; i++) {
2228 if (scan_inst->src[i].file == GRF &&
2229 scan_inst->src[i].reg == inst->src[0].reg &&
2230 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2231 interfered = true;
2232 }
2233 }
2234 if (interfered)
2235 break;
2236
2237 if (scan_inst->dst.file == MRF) {
2238 /* If somebody else writes our MRF here, we can't
2239 * compute-to-MRF before that.
2240 */
2241 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2242 int scan_mrf_high;
2243
2244 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2245 scan_mrf_high = scan_mrf_low + 4;
2246 } else if (dispatch_width == 16 &&
2247 (!scan_inst->force_uncompressed &&
2248 !scan_inst->force_sechalf)) {
2249 scan_mrf_high = scan_mrf_low + 1;
2250 } else {
2251 scan_mrf_high = scan_mrf_low;
2252 }
2253
2254 if (mrf_low == scan_mrf_low ||
2255 mrf_low == scan_mrf_high ||
2256 mrf_high == scan_mrf_low ||
2257 mrf_high == scan_mrf_high) {
2258 break;
2259 }
2260 }
2261
2262 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2263 /* Found a SEND instruction, which means that there are
2264 * live values in MRFs from base_mrf to base_mrf +
2265 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2266 * above it.
2267 */
2268 if (mrf_low >= scan_inst->base_mrf &&
2269 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2270 break;
2271 }
2272 if (mrf_high >= scan_inst->base_mrf &&
2273 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2274 break;
2275 }
2276 }
2277 }
2278 }
2279
2280 if (progress)
2281 invalidate_live_intervals();
2282
2283 return progress;
2284 }
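
/* Shape of the rewrite performed above (illustrative), assuming vgrf7 is not
 * read again after the MOV and all the restrictions checked above hold (full
 * register write, matching types, no source modifiers):
 *
 *    add vgrf7, vgrf5, vgrf6
 *    mov m4, vgrf7
 *
 * becomes
 *
 *    add m4, vgrf5, vgrf6
 *
 * i.e. the generating instruction writes the MRF directly and the MOV is
 * removed.
 */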
2285
2286 /**
2287 * Walks through basic blocks, looking for repeated MRF writes and
2288 * removing the later ones.
2289 */
2290 bool
2291 fs_visitor::remove_duplicate_mrf_writes()
2292 {
2293 fs_inst *last_mrf_move[16];
2294 bool progress = false;
2295
2296 /* In SIMD16 mode we would need to update the MRF tracking for compressed instructions, so skip this pass. */
2297 if (dispatch_width == 16)
2298 return false;
2299
2300 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2301
2302 foreach_in_list_safe(fs_inst, inst, &instructions) {
2303 if (inst->is_control_flow()) {
2304 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2305 }
2306
2307 if (inst->opcode == BRW_OPCODE_MOV &&
2308 inst->dst.file == MRF) {
2309 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2310 if (prev_inst && inst->equals(prev_inst)) {
2311 inst->remove();
2312 progress = true;
2313 continue;
2314 }
2315 }
2316
2317 /* Clear out the last-write records for MRFs that were overwritten. */
2318 if (inst->dst.file == MRF) {
2319 last_mrf_move[inst->dst.reg] = NULL;
2320 }
2321
2322 if (inst->mlen > 0 && inst->base_mrf != -1) {
2323 /* Found a SEND instruction, which will include two or fewer
2324 * implied MRF writes. We could do better here.
2325 */
2326 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2327 last_mrf_move[inst->base_mrf + i] = NULL;
2328 }
2329 }
2330
2331 /* Clear out any MRF move records whose sources got overwritten. */
2332 if (inst->dst.file == GRF) {
2333 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2334 if (last_mrf_move[i] &&
2335 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2336 last_mrf_move[i] = NULL;
2337 }
2338 }
2339 }
2340
2341 if (inst->opcode == BRW_OPCODE_MOV &&
2342 inst->dst.file == MRF &&
2343 inst->src[0].file == GRF &&
2344 !inst->is_partial_write()) {
2345 last_mrf_move[inst->dst.reg] = inst;
2346 }
2347 }
2348
2349 if (progress)
2350 invalidate_live_intervals();
2351
2352 return progress;
2353 }
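
/* Example of what gets cleaned up above (illustrative): two identical
 *
 *    mov m2, vgrf3
 *
 * instructions with no intervening control flow and no intervening write to
 * m2 or vgrf3 -- the second MOV is removed.
 */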
2354
2355 static void
2356 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2357 int first_grf, int grf_len)
2358 {
2359 bool inst_simd16 = (dispatch_width > 8 &&
2360 !inst->force_uncompressed &&
2361 !inst->force_sechalf);
2362
2363 /* Clear the flag for registers that actually got read (as expected). */
2364 for (int i = 0; i < inst->sources; i++) {
2365 int grf;
2366 if (inst->src[i].file == GRF) {
2367 grf = inst->src[i].reg;
2368 } else if (inst->src[i].file == HW_REG &&
2369 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2370 grf = inst->src[i].fixed_hw_reg.nr;
2371 } else {
2372 continue;
2373 }
2374
2375 if (grf >= first_grf &&
2376 grf < first_grf + grf_len) {
2377 deps[grf - first_grf] = false;
2378 if (inst_simd16)
2379 deps[grf - first_grf + 1] = false;
2380 }
2381 }
2382 }
2383
2384 /**
2385 * Implements this workaround for the original 965:
2386 *
2387 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2388 * check for post destination dependencies on this instruction, software
2389 * must ensure that there is no destination hazard for the case of ‘write
2390 * followed by a posted write’ shown in the following example.
2391 *
2392 * 1. mov r3 0
2393 * 2. send r3.xy <rest of send instruction>
2394 * 3. mov r2 r3
2395 *
2396 * Due to no post-destination dependency check on the ‘send’, the above
2397 * code sequence could have two instructions (1 and 2) in flight at the
2398 * same time that both consider ‘r3’ as the target of their final writes.
2399 */
2400 void
2401 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2402 {
2403 int reg_size = dispatch_width / 8;
2404 int write_len = inst->regs_written * reg_size;
2405 int first_write_grf = inst->dst.reg;
2406 bool needs_dep[BRW_MAX_MRF];
2407 assert(write_len < (int)sizeof(needs_dep) - 1);
2408
2409 memset(needs_dep, false, sizeof(needs_dep));
2410 memset(needs_dep, true, write_len);
2411
2412 clear_deps_for_inst_src(inst, dispatch_width,
2413 needs_dep, first_write_grf, write_len);
2414
2415 /* Walk backwards looking for writes to the registers we're writing that
2416 * haven't been read since they were written. If we hit the start of the program,
2417 * we assume that there are no outstanding dependencies on entry to the
2418 * program.
2419 */
2420 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2421 !scan_inst->is_head_sentinel();
2422 scan_inst = (fs_inst *)scan_inst->prev) {
2423
2424 /* If we hit control flow, assume that there *are* outstanding
2425 * dependencies, and force their cleanup before our instruction.
2426 */
2427 if (scan_inst->is_control_flow()) {
2428 for (int i = 0; i < write_len; i++) {
2429 if (needs_dep[i]) {
2430 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2431 }
2432 }
2433 return;
2434 }
2435
2436 bool scan_inst_simd16 = (dispatch_width > 8 &&
2437 !scan_inst->force_uncompressed &&
2438 !scan_inst->force_sechalf);
2439
2440 /* We insert our reads as late as possible on the assumption that any
2441 * instruction other than a MOV that might have left us an outstanding
2442 * dependency has more latency than a MOV.
2443 */
2444 if (scan_inst->dst.file == GRF) {
2445 for (int i = 0; i < scan_inst->regs_written; i++) {
2446 int reg = scan_inst->dst.reg + i * reg_size;
2447
2448 if (reg >= first_write_grf &&
2449 reg < first_write_grf + write_len &&
2450 needs_dep[reg - first_write_grf]) {
2451 inst->insert_before(DEP_RESOLVE_MOV(reg));
2452 needs_dep[reg - first_write_grf] = false;
2453 if (scan_inst_simd16)
2454 needs_dep[reg - first_write_grf + 1] = false;
2455 }
2456 }
2457 }
2458
2459 /* Clear the flag for registers that actually got read (as expected). */
2460 clear_deps_for_inst_src(scan_inst, dispatch_width,
2461 needs_dep, first_write_grf, write_len);
2462
2463 /* Continue the loop only if we haven't resolved all the dependencies */
2464 int i;
2465 for (i = 0; i < write_len; i++) {
2466 if (needs_dep[i])
2467 break;
2468 }
2469 if (i == write_len)
2470 return;
2471 }
2472 }
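
/* For the example sequence in the comment above, the workaround inserts a
 * DEP_RESOLVE_MOV on r3 immediately before the send, forcing the earlier
 * "mov r3 0" to be complete before the send's posted write is issued.
 */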
2473
2474 /**
2475 * Implements this workaround for the original 965:
2476 *
2477 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2478 * used as a destination register until after it has been sourced by an
2479 * instruction with a different destination register."
2480 */
2481 void
2482 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2483 {
2484 int write_len = inst->regs_written * dispatch_width / 8;
2485 int first_write_grf = inst->dst.reg;
2486 bool needs_dep[BRW_MAX_MRF];
2487 assert(write_len < (int)sizeof(needs_dep) - 1);
2488
2489 memset(needs_dep, false, sizeof(needs_dep));
2490 memset(needs_dep, true, write_len);
2491 /* Walk forwards looking for writes to the registers we're writing that haven't
2492 * been read before being written again.
2493 */
2494 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2495 !scan_inst->is_tail_sentinel();
2496 scan_inst = (fs_inst *)scan_inst->next) {
2497 /* If we hit control flow, force resolve all remaining dependencies. */
2498 if (scan_inst->is_control_flow()) {
2499 for (int i = 0; i < write_len; i++) {
2500 if (needs_dep[i])
2501 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2502 }
2503 return;
2504 }
2505
2506 /* Clear the flag for registers that actually got read (as expected). */
2507 clear_deps_for_inst_src(scan_inst, dispatch_width,
2508 needs_dep, first_write_grf, write_len);
2509
2510 /* We insert our reads as late as possible since they're reading the
2511 * result of a SEND, which has massive latency.
2512 */
2513 if (scan_inst->dst.file == GRF &&
2514 scan_inst->dst.reg >= first_write_grf &&
2515 scan_inst->dst.reg < first_write_grf + write_len &&
2516 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2517 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2518 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2519 }
2520
2521 /* Continue the loop only if we haven't resolved all the dependencies */
2522 int i;
2523 for (i = 0; i < write_len; i++) {
2524 if (needs_dep[i])
2525 break;
2526 }
2527 if (i == write_len)
2528 return;
2529 }
2530
2531 /* If we hit the end of the program, resolve all remaining dependencies out
2532 * of paranoia.
2533 */
2534 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2535 assert(last_inst->eot);
2536 for (int i = 0; i < write_len; i++) {
2537 if (needs_dep[i])
2538 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2539 }
2540 }
2541
2542 void
2543 fs_visitor::insert_gen4_send_dependency_workarounds()
2544 {
2545 if (brw->gen != 4 || brw->is_g4x)
2546 return;
2547
2548 bool progress = false;
2549
2550 /* Note that we're done with register allocation, so GRF fs_regs always
2551 * have a .reg_offset of 0.
2552 */
2553
2554 foreach_in_list_safe(fs_inst, inst, &instructions) {
2555 if (inst->mlen != 0 && inst->dst.file == GRF) {
2556 insert_gen4_pre_send_dependency_workarounds(inst);
2557 insert_gen4_post_send_dependency_workarounds(inst);
2558 progress = true;
2559 }
2560 }
2561
2562 if (progress)
2563 invalidate_live_intervals();
2564 }
2565
2566 /**
2567 * Turns the generic expression-style uniform pull constant load instruction
2568 * into a hardware-specific series of instructions for loading a pull
2569 * constant.
2570 *
2571 * The expression style allows the CSE pass before this to optimize out
2572 * repeated loads from the same offset, and gives the pre-register-allocation
2573 * scheduling full flexibility, while the conversion to native instructions
2574 * allows the post-register-allocation scheduler the best information
2575 * possible.
2576 *
2577 * Note that execution masking for setting up pull constant loads is special:
2578 * the channels that need to be written are unrelated to the current execution
2579 * mask, since a later instruction will use one of the result channels as a
2580 * source operand for all 8 or 16 of its channels.
2581 */
2582 void
2583 fs_visitor::lower_uniform_pull_constant_loads()
2584 {
2585 foreach_in_list(fs_inst, inst, &instructions) {
2586 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2587 continue;
2588
2589 if (brw->gen >= 7) {
2590 /* Up to this point the offset arg is a vec4-aligned byte offset. We need
2591 * to turn it into a dword offset.
2592 */
2593 fs_reg const_offset_reg = inst->src[1];
2594 assert(const_offset_reg.file == IMM &&
2595 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2596 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2597 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2598
2599 /* This is actually going to be a MOV, but since only the first dword
2600 * is accessed, we have a special opcode to do just that one. Note
2601 * that this needs to be an operation that will be considered a def
2602 * by live variable analysis, or register allocation will explode.
2603 */
2604 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2605 payload, const_offset_reg);
2606 setup->force_writemask_all = true;
2607
2608 setup->ir = inst->ir;
2609 setup->annotation = inst->annotation;
2610 inst->insert_before(setup);
2611
2612 /* Similarly, this will only populate the first 4 channels of the
2613 * result register (since we only use smear values from 0-3), but we
2614 * don't tell the optimizer.
2615 */
2616 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2617 inst->src[1] = payload;
2618
2619 invalidate_live_intervals();
2620 } else {
2621 /* Before register allocation, we didn't tell the scheduler about the
2622 * MRF we use. We know it's safe to use this MRF because nothing
2623 * else does except for register spill/unspill, which generates and
2624 * uses its MRF within a single IR instruction.
2625 */
2626 inst->base_mrf = 14;
2627 inst->mlen = 1;
2628 }
2629 }
2630 }
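
/* Illustrative gen7 lowering (values chosen for exposition): a load whose
 * src[1] is the immediate byte offset 48 has that offset rescaled to 12
 * dwords, a SET_SIMD4X2_OFFSET with force_writemask_all writes 12 into a new
 * payload VGRF, and the opcode becomes
 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 with that payload as src[1]. On
 * earlier generations the instruction is left as-is and simply gets
 * base_mrf = 14 and mlen = 1.
 */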
2631
2632 bool
2633 fs_visitor::lower_load_payload()
2634 {
2635 bool progress = false;
2636
2637 foreach_in_list_safe(fs_inst, inst, &instructions) {
2638 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2639 fs_reg dst = inst->dst;
2640
2641 /* src[0] represents the (optional) message header. */
2642 if (inst->src[0].file != BAD_FILE) {
2643 inst->insert_before(MOV(dst, inst->src[0]));
2644 }
2645 dst.reg_offset++;
2646
2647 for (int i = 1; i < inst->sources; i++) {
2648 inst->insert_before(MOV(dst, inst->src[i]));
2649 dst.reg_offset++;
2650 }
2651
2652 inst->remove();
2653 progress = true;
2654 }
2655 }
2656
2657 if (progress)
2658 invalidate_live_intervals();
2659
2660 return progress;
2661 }
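
/* Shape of the lowering above (illustrative): a three-source
 *
 *    LOAD_PAYLOAD vgrf8, header, a, b
 *
 * becomes
 *
 *    mov vgrf8+0, header
 *    mov vgrf8+1, a
 *    mov vgrf8+2, b
 *
 * When there is no header (src[0] is BAD_FILE) the first MOV is omitted, but
 * the remaining sources still start at reg_offset 1.
 */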
2662
2663 void
2664 fs_visitor::dump_instructions()
2665 {
2666 dump_instructions(NULL);
2667 }
2668
2669 void
2670 fs_visitor::dump_instructions(const char *name)
2671 {
2672 calculate_register_pressure();
2673 FILE *file = stderr;
2674 if (name && geteuid() != 0) {
2675 file = fopen(name, "w");
2676 if (!file)
2677 file = stderr;
2678 }
2679
2680 int ip = 0, max_pressure = 0;
2681 foreach_in_list(backend_instruction, inst, &instructions) {
2682 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2683 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2684 dump_instruction(inst, file);
2685 ++ip;
2686 }
2687 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2688
2689 if (file != stderr) {
2690 fclose(file);
2691 }
2692 }
2693
2694 void
2695 fs_visitor::dump_instruction(backend_instruction *be_inst)
2696 {
2697 dump_instruction(be_inst, stderr);
2698 }
2699
2700 void
2701 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2702 {
2703 fs_inst *inst = (fs_inst *)be_inst;
2704
2705 if (inst->predicate) {
2706 fprintf(file, "(%cf0.%d) ",
2707 inst->predicate_inverse ? '-' : '+',
2708 inst->flag_subreg);
2709 }
2710
2711 fprintf(file, "%s", brw_instruction_name(inst->opcode));
2712 if (inst->saturate)
2713 fprintf(file, ".sat");
2714 if (inst->conditional_mod) {
2715 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2716 if (!inst->predicate &&
2717 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2718 inst->opcode != BRW_OPCODE_IF &&
2719 inst->opcode != BRW_OPCODE_WHILE))) {
2720 fprintf(file, ".f0.%d", inst->flag_subreg);
2721 }
2722 }
2723 fprintf(file, " ");
2724
2725
2726 switch (inst->dst.file) {
2727 case GRF:
2728 fprintf(file, "vgrf%d", inst->dst.reg);
2729 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2730 inst->dst.subreg_offset)
2731 fprintf(file, "+%d.%d",
2732 inst->dst.reg_offset, inst->dst.subreg_offset);
2733 break;
2734 case MRF:
2735 fprintf(file, "m%d", inst->dst.reg);
2736 break;
2737 case BAD_FILE:
2738 fprintf(file, "(null)");
2739 break;
2740 case UNIFORM:
2741 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2742 break;
2743 case HW_REG:
2744 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2745 switch (inst->dst.fixed_hw_reg.nr) {
2746 case BRW_ARF_NULL:
2747 fprintf(file, "null");
2748 break;
2749 case BRW_ARF_ADDRESS:
2750 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2751 break;
2752 case BRW_ARF_ACCUMULATOR:
2753 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2754 break;
2755 case BRW_ARF_FLAG:
2756 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2757 inst->dst.fixed_hw_reg.subnr);
2758 break;
2759 default:
2760 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2761 inst->dst.fixed_hw_reg.subnr);
2762 break;
2763 }
2764 } else {
2765 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2766 }
2767 if (inst->dst.fixed_hw_reg.subnr)
2768 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2769 break;
2770 default:
2771 fprintf(file, "???");
2772 break;
2773 }
2774 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2775
2776 for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2777 if (inst->src[i].negate)
2778 fprintf(file, "-");
2779 if (inst->src[i].abs)
2780 fprintf(file, "|");
2781 switch (inst->src[i].file) {
2782 case GRF:
2783 fprintf(file, "vgrf%d", inst->src[i].reg);
2784 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2785 inst->src[i].subreg_offset)
2786 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2787 inst->src[i].subreg_offset);
2788 break;
2789 case MRF:
2790 fprintf(file, "***m%d***", inst->src[i].reg);
2791 break;
2792 case UNIFORM:
2793 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2794 if (inst->src[i].reladdr) {
2795 fprintf(file, "+reladdr");
2796 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2797 inst->src[i].subreg_offset) {
2798 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2799 inst->src[i].subreg_offset);
2800 }
2801 break;
2802 case BAD_FILE:
2803 fprintf(file, "(null)");
2804 break;
2805 case IMM:
2806 switch (inst->src[i].type) {
2807 case BRW_REGISTER_TYPE_F:
2808 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
2809 break;
2810 case BRW_REGISTER_TYPE_D:
2811 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
2812 break;
2813 case BRW_REGISTER_TYPE_UD:
2814 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
2815 break;
2816 default:
2817 fprintf(file, "???");
2818 break;
2819 }
2820 break;
2821 case HW_REG:
2822 if (inst->src[i].fixed_hw_reg.negate)
2823 fprintf(file, "-");
2824 if (inst->src[i].fixed_hw_reg.abs)
2825 fprintf(file, "|");
2826 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2827 switch (inst->src[i].fixed_hw_reg.nr) {
2828 case BRW_ARF_NULL:
2829 fprintf(file, "null");
2830 break;
2831 case BRW_ARF_ADDRESS:
2832 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2833 break;
2834 case BRW_ARF_ACCUMULATOR:
2835 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2836 break;
2837 case BRW_ARF_FLAG:
2838 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2839 inst->src[i].fixed_hw_reg.subnr);
2840 break;
2841 default:
2842 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2843 inst->src[i].fixed_hw_reg.subnr);
2844 break;
2845 }
2846 } else {
2847 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2848 }
2849 if (inst->src[i].fixed_hw_reg.subnr)
2850 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2851 if (inst->src[i].fixed_hw_reg.abs)
2852 fprintf(file, "|");
2853 break;
2854 default:
2855 fprintf(file, "???");
2856 break;
2857 }
2858 if (inst->src[i].abs)
2859 fprintf(file, "|");
2860
2861 if (inst->src[i].file != IMM) {
2862 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2863 }
2864
2865 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2866 fprintf(file, ", ");
2867 }
2868
2869 fprintf(file, " ");
2870
2871 if (inst->force_uncompressed)
2872 fprintf(file, "1sthalf ");
2873
2874 if (inst->force_sechalf)
2875 fprintf(file, "2ndhalf ");
2876
2877 fprintf(file, "\n");
2878 }
2879
2880 /**
2881 * Possibly returns an instruction that set up @param reg.
2882 *
2883 * Sometimes we want to take the result of some expression/variable
2884 * dereference tree and rewrite the instruction generating the result
2885 * of the tree. When processing the tree, we know that the
2886 * instructions generated are all writing temporaries that are dead
2887 * outside of this tree. So, if we have some instructions that write
2888 * a temporary, we're free to point that temp write somewhere else.
2889 *
2890 * Note that this doesn't guarantee that the returned instruction wrote
2891 * only reg -- it might be the size=4 destination of a texture instruction.
2892 */
2893 fs_inst *
2894 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2895 fs_inst *end,
2896 const fs_reg &reg)
2897 {
2898 if (end == start ||
2899 end->is_partial_write() ||
2900 reg.reladdr ||
2901 !reg.equals(end->dst)) {
2902 return NULL;
2903 } else {
2904 return end;
2905 }
2906 }
2907
2908 void
2909 fs_visitor::setup_payload_gen6()
2910 {
2911 bool uses_depth =
2912 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2913 unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
2914
2915 assert(brw->gen >= 6);
2916
2917 /* R0-1: masks, pixel X/Y coordinates. */
2918 payload.num_regs = 2;
2919 /* R2: only for 32-pixel dispatch. */
2920
2921 /* R3-26: barycentric interpolation coordinates. These appear in the
2922 * same order that they appear in the brw_wm_barycentric_interp_mode
2923 * enum. Each set of coordinates occupies 2 registers if dispatch width
2924 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2925 * appear if they were enabled using the "Barycentric Interpolation
2926 * Mode" bits in WM_STATE.
2927 */
2928 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2929 if (barycentric_interp_modes & (1 << i)) {
2930 payload.barycentric_coord_reg[i] = payload.num_regs;
2931 payload.num_regs += 2;
2932 if (dispatch_width == 16) {
2933 payload.num_regs += 2;
2934 }
2935 }
2936 }
2937
2938 /* R27: interpolated depth if uses source depth */
2939 if (uses_depth) {
2940 payload.source_depth_reg = payload.num_regs;
2941 payload.num_regs++;
2942 if (dispatch_width == 16) {
2943 /* R28: interpolated depth if not SIMD8. */
2944 payload.num_regs++;
2945 }
2946 }
2947 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2948 if (uses_depth) {
2949 payload.source_w_reg = payload.num_regs;
2950 payload.num_regs++;
2951 if (dispatch_width == 16) {
2952 /* R30: interpolated W if not SIMD8. */
2953 payload.num_regs++;
2954 }
2955 }
2956
2957 prog_data->uses_pos_offset = key->compute_pos_offset;
2958 /* R31: MSAA position offsets. */
2959 if (prog_data->uses_pos_offset) {
2960 payload.sample_pos_reg = payload.num_regs;
2961 payload.num_regs++;
2962 }
2963
2964 /* R32: MSAA input coverage mask */
2965 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2966 assert(brw->gen >= 7);
2967 payload.sample_mask_in_reg = payload.num_regs;
2968 payload.num_regs++;
2969 if (dispatch_width == 16) {
2970 /* R33: input coverage mask if not SIMD8. */
2971 payload.num_regs++;
2972 }
2973 }
2974
2975 /* R34-: bary for 32-pixel. */
2976 /* R58-59: interp W for 32-pixel. */
2977
2978 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2979 source_depth_to_render_target = true;
2980 }
2981 }
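
/* Example layout produced above (illustrative): SIMD8, one barycentric mode
 * enabled, source depth/W used, no position offsets or coverage mask:
 *
 *    r0-r1: masks, pixel X/Y        (payload.num_regs starts at 2)
 *    r2-r3: barycentric coordinates (barycentric_coord_reg[mode] == 2)
 *    r4:    interpolated depth      (source_depth_reg == 4)
 *    r5:    interpolated W          (source_w_reg == 5)
 *
 * leaving payload.num_regs == 6. In SIMD16 the barycentric, depth and W
 * entries each take twice as many registers.
 */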
2982
2983 void
2984 fs_visitor::assign_binding_table_offsets()
2985 {
2986 uint32_t next_binding_table_offset = 0;
2987
2988 /* If there are no color regions, we still perform an FB write to a null
2989 * renderbuffer, which we place at surface index 0.
2990 */
2991 prog_data->binding_table.render_target_start = next_binding_table_offset;
2992 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
2993
2994 assign_common_binding_table_offsets(next_binding_table_offset);
2995 }
2996
2997 void
2998 fs_visitor::calculate_register_pressure()
2999 {
3000 invalidate_live_intervals();
3001 calculate_live_intervals();
3002
3003 unsigned num_instructions = instructions.length();
3004
3005 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3006
3007 for (int reg = 0; reg < virtual_grf_count; reg++) {
3008 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3009 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3010 }
3011 }
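
/* Illustrative example: a virtual GRF of size 2 that is live from ip 5
 * through ip 9 contributes 2 to regs_live_at_ip[5..9]; the per-ip totals are
 * the register pressure figures printed by dump_instructions().
 */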
3012
3013 /**
3014 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3015 *
3016 * The needs_unlit_centroid_workaround ends up producing one of these per
3017 * channel of centroid input, so it's good to clean them up.
3018 *
3019 * An assumption here is that nothing ever modifies the dispatched pixels
3020 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3021 * dictates that anyway.
3022 */
3023 void
3024 fs_visitor::opt_drop_redundant_mov_to_flags()
3025 {
3026 bool flag_mov_found[2] = {false};
3027
3028 foreach_in_list_safe(fs_inst, inst, &instructions) {
3029 if (inst->is_control_flow()) {
3030 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3031 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3032 if (!flag_mov_found[inst->flag_subreg])
3033 flag_mov_found[inst->flag_subreg] = true;
3034 else
3035 inst->remove();
3036 } else if (inst->writes_flag()) {
3037 flag_mov_found[inst->flag_subreg] = false;
3038 }
3039 }
3040 }
3041
3042 bool
3043 fs_visitor::run()
3044 {
3045 sanity_param_count = fp->Base.Parameters->NumParameters;
3046 bool allocated_without_spills;
3047
3048 assign_binding_table_offsets();
3049
3050 if (brw->gen >= 6)
3051 setup_payload_gen6();
3052 else
3053 setup_payload_gen4();
3054
3055 if (0) {
3056 emit_dummy_fs();
3057 } else {
3058 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3059 emit_shader_time_begin();
3060
3061 calculate_urb_setup();
3062 if (fp->Base.InputsRead > 0) {
3063 if (brw->gen < 6)
3064 emit_interpolation_setup_gen4();
3065 else
3066 emit_interpolation_setup_gen6();
3067 }
3068
3069 /* We handle discards by keeping track of the still-live pixels in f0.1.
3070 * Initialize it with the dispatched pixels.
3071 */
3072 if (fp->UsesKill || key->alpha_test_func) {
3073 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3074 discard_init->flag_subreg = 1;
3075 }
3076
3077 /* Generate FS IR for main(). (The visitor only descends into
3078 * functions called "main".)
3079 */
3080 if (shader) {
3081 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3082 base_ir = ir;
3083 this->result = reg_undef;
3084 ir->accept(this);
3085 }
3086 } else {
3087 emit_fragment_program_code();
3088 }
3089 base_ir = NULL;
3090 if (failed)
3091 return false;
3092
3093 emit(FS_OPCODE_PLACEHOLDER_HALT);
3094
3095 if (key->alpha_test_func)
3096 emit_alpha_test();
3097
3098 emit_fb_writes();
3099
3100 split_virtual_grfs();
3101
3102 move_uniform_array_access_to_pull_constants();
3103 assign_constant_locations();
3104 demote_pull_constants();
3105
3106 opt_drop_redundant_mov_to_flags();
3107
3108 #define OPT(pass, args...) do { \
3109 pass_num++; \
3110 bool this_progress = pass(args); \
3111 \
3112 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3113 char filename[64]; \
3114 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3115 dispatch_width, shader_prog->Name, iteration, pass_num); \
3116 \
3117 backend_visitor::dump_instructions(filename); \
3118 } \
3119 \
3120 progress = progress || this_progress; \
3121 } while (false)
3122
3123 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3124 char filename[64];
3125 snprintf(filename, 64, "fs%d-%04d-00-start",
3126 dispatch_width, shader_prog->Name);
3127
3128 backend_visitor::dump_instructions(filename);
3129 }
3130
3131 bool progress;
3132 int iteration = 0;
3133 do {
3134 progress = false;
3135 iteration++;
3136 int pass_num = 0;
3137
3138 compact_virtual_grfs();
3139
3140 OPT(remove_duplicate_mrf_writes);
3141
3142 OPT(opt_algebraic);
3143 OPT(opt_cse);
3144 OPT(opt_copy_propagate);
3145 OPT(opt_peephole_predicated_break);
3146 OPT(dead_code_eliminate);
3147 OPT(opt_peephole_sel);
3148 OPT(dead_control_flow_eliminate, this);
3149 OPT(opt_register_renaming);
3150 OPT(opt_saturate_propagation);
3151 OPT(register_coalesce);
3152 OPT(compute_to_mrf);
3153 } while (progress);
3154
3155 if (lower_load_payload()) {
3156 register_coalesce();
3157 dead_code_eliminate();
3158 }
3159
3160 lower_uniform_pull_constant_loads();
3161
3162 assign_curb_setup();
3163 assign_urb_setup();
3164
3165 static enum instruction_scheduler_mode pre_modes[] = {
3166 SCHEDULE_PRE,
3167 SCHEDULE_PRE_NON_LIFO,
3168 SCHEDULE_PRE_LIFO,
3169 };
3170
3171 /* Try each scheduling heuristic to see if it can successfully register
3172 * allocate without spilling. They should be ordered by decreasing
3173 * performance but increasing likelihood of allocating.
3174 */
3175 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3176 schedule_instructions(pre_modes[i]);
3177
3178 if (0) {
3179 assign_regs_trivial();
3180 allocated_without_spills = true;
3181 } else {
3182 allocated_without_spills = assign_regs(false);
3183 }
3184 if (allocated_without_spills)
3185 break;
3186 }
3187
3188 if (!allocated_without_spills) {
3189 /* We assume that any spilling is worse than just dropping back to
3190 * SIMD8. There's probably actually some intermediate point where
3191 * SIMD16 with a couple of spills is still better.
3192 */
3193 if (dispatch_width == 16) {
3194 fail("Failure to register allocate. Reduce number of "
3195 "live scalar values to avoid this.");
3196 } else {
3197 perf_debug("Fragment shader triggered register spilling. "
3198 "Try reducing the number of live scalar values to "
3199 "improve performance.\n");
3200 }
3201
3202 /* Since we're out of heuristics, just go spill registers until we
3203 * get an allocation.
3204 */
3205 while (!assign_regs(true)) {
3206 if (failed)
3207 break;
3208 }
3209 }
3210 }
3211 assert(force_uncompressed_stack == 0);
3212
3213 /* This must come after all optimization and register allocation, since
3214 * it inserts dead code that happens to have side effects, and it does
3215 * so based on the actual physical registers in use.
3216 */
3217 insert_gen4_send_dependency_workarounds();
3218
3219 if (failed)
3220 return false;
3221
3222 if (!allocated_without_spills)
3223 schedule_instructions(SCHEDULE_POST);
3224
3225 if (last_scratch > 0) {
3226 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3227 }
3228
3229 if (dispatch_width == 8)
3230 prog_data->reg_blocks = brw_register_blocks(grf_used);
3231 else
3232 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3233
3234 /* If any state parameters were appended, then ParameterValues could have
3235 * been realloced, in which case the driver uniform storage set up by
3236 * _mesa_associate_uniform_storage() would point to freed memory. Make
3237 * sure that didn't happen.
3238 */
3239 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3240
3241 return !failed;
3242 }
3243
3244 const unsigned *
3245 brw_wm_fs_emit(struct brw_context *brw,
3246 void *mem_ctx,
3247 const struct brw_wm_prog_key *key,
3248 struct brw_wm_prog_data *prog_data,
3249 struct gl_fragment_program *fp,
3250 struct gl_shader_program *prog,
3251 unsigned *final_assembly_size)
3252 {
3253 bool start_busy = false;
3254 double start_time = 0;
3255
3256 if (unlikely(brw->perf_debug)) {
3257 start_busy = (brw->batch.last_bo &&
3258 drm_intel_bo_busy(brw->batch.last_bo));
3259 start_time = get_time();
3260 }
3261
3262 struct brw_shader *shader = NULL;
3263 if (prog)
3264 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3265
3266 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3267 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3268
3269 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3270 */
3271 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3272 if (!v.run()) {
3273 if (prog) {
3274 prog->LinkStatus = false;
3275 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3276 }
3277
3278 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3279 v.fail_msg);
3280
3281 return NULL;
3282 }
3283
3284 exec_list *simd16_instructions = NULL;
3285 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3286 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3287 if (!v.simd16_unsupported) {
3288 /* Try a SIMD16 compile */
3289 v2.import_uniforms(&v);
3290 if (!v2.run()) {
3291 perf_debug("SIMD16 shader failed to compile, falling back to "
3292 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3293 } else {
3294 simd16_instructions = &v2.instructions;
3295 }
3296 } else {
3297 perf_debug("SIMD16 shader unsupported, falling back to "
3298 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3299 }
3300 }
3301
3302 exec_list *simd8_instructions;
3303 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3304 if (no_simd8 && simd16_instructions) {
3305 simd8_instructions = NULL;
3306 prog_data->no_8 = true;
3307 } else {
3308 simd8_instructions = &v.instructions;
3309 prog_data->no_8 = false;
3310 }
3311
3312 const unsigned *assembly = NULL;
3313 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3314 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3315 assembly = g.generate_assembly(simd8_instructions, simd16_instructions,
3316 final_assembly_size);
3317
3318 if (unlikely(brw->perf_debug) && shader) {
3319 if (shader->compiled_once)
3320 brw_wm_debug_recompile(brw, prog, key);
3321 shader->compiled_once = true;
3322
3323 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3324 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3325 (get_time() - start_time) * 1000);
3326 }
3327 }
3328
3329 return assembly;
3330 }
3331
3332 bool
3333 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3334 {
3335 struct brw_context *brw = brw_context(ctx);
3336 struct brw_wm_prog_key key;
3337
3338 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3339 return true;
3340
3341 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3342 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3343 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3344 bool program_uses_dfdy = fp->UsesDFdy;
3345
3346 memset(&key, 0, sizeof(key));
3347
3348 if (brw->gen < 6) {
3349 if (fp->UsesKill)
3350 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3351
3352 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3353 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3354
3355 /* Just assume depth testing. */
3356 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3357 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3358 }
3359
3360 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3361 BRW_FS_VARYING_INPUT_MASK) > 16)
3362 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3363
3364 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3365 for (unsigned i = 0; i < sampler_count; i++) {
3366 if (fp->Base.ShadowSamplers & (1 << i)) {
3367 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3368 key.tex.swizzles[i] =
3369 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3370 } else {
3371 /* Color sampler: assume no swizzling. */
3372 key.tex.swizzles[i] = SWIZZLE_XYZW;
3373 }
3374 }
3375
3376 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3377 key.drawable_height = ctx->DrawBuffer->Height;
3378 }
3379
3380 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3381 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3382 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3383
3384 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3385 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3386 key.nr_color_regions > 1;
3387 }
3388
3389 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3390 * quality of the derivatives is likely to be determined by the driconf
3391 * option.
3392 */
3393 key.high_quality_derivatives = brw->disable_derivative_optimization;
3394
3395 key.program_string_id = bfp->id;
3396
3397 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3398 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3399
3400 bool success = do_wm_prog(brw, prog, bfp, &key);
3401
3402 brw->wm.base.prog_offset = old_prog_offset;
3403 brw->wm.prog_data = old_prog_data;
3404
3405 return success;
3406 }