i965/fs: Track whether we're doing dual source in a more obvious way.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
/* Reset an fs_inst to a known-clean default state.  Called from every
 * constructor before opcode/operand fields are filled in.
 */
void
fs_inst::init()
{
   /* Zero every field; most flags (saturate, predicate, etc.) default to 0. */
   memset(this, 0, sizeof(*this));
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* Mark destination and all three sources as unset. */
   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;

   /* This will be the case for almost all instructions. */
   this->regs_written = 1;
}
68
/* Default constructor: a NOP with no operands. */
fs_inst::fs_inst()
{
   init();
   this->opcode = BRW_OPCODE_NOP;
}
74
/* Opcode-only constructor (no destination or sources). */
fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}
80
/* Opcode + destination constructor. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   /* Virtual GRF destinations must have a sane register offset. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}
90
/* One-source instruction constructor. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   /* Sanity-check offsets of any virtual GRF operands. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}
103
/* Two-source instruction constructor. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   /* Sanity-check offsets of any virtual GRF operands. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}
119
/* Three-source instruction constructor (MAD, LRP, BFE, ...). */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   /* Sanity-check offsets of any virtual GRF operands. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}
139
/* Factory helpers: define fs_visitor::<OP>() methods that allocate (but do
 * not emit) an fs_inst for the matching BRW_OPCODE_<OP>, for one-, two- and
 * three-source ALU instructions respectively.
 */
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2(ADDC)
ALU2(SUBB)
ALU2(SEL)
188
/** Gen4 predicated IF: branches based on an already-computed predicate. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}
197
/** Gen6 IF with embedded comparison: compares src0 to src1 with the given
 * conditional mod and branches on the result in a single instruction.
 * Only gen6 supports this form.
 */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(brw->gen == 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}
208
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      /* Keep the fixed hardware register's type in sync with the fs_reg. */
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   /* Negation of UD sources doesn't behave as arithmetic negate; resolve it
    * into a separate instruction first.
    */
   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
245
/* Build (but do not emit) the instruction sequence for a pull-constant load
 * whose offset varies at run time.  Returns the list of instructions; the
 * caller decides where to insert them.
 *
 * dst:            where the loaded component ends up
 * surf_index:     surface binding table index of the constant buffer
 * varying_offset: per-channel byte offset into the buffer
 * const_offset:   compile-time constant addend to that offset
 */
exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
                                       const fs_reg &surf_index,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (brw->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   /* Gen7+ has a send-from-GRF variant that needs no MRF/header setup. */
   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   if (brw->gen < 7) {
      /* Legacy message needs an MRF base and a header; message length
       * depends on gen and dispatch width.
       */
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   /* Pick the requested component out of the 4 loaded ones. */
   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}
304
/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 *
 * Reads the given GRF into the null register so the hardware sees a
 * dependency on it; the MOV itself has no other effect.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   /* Not associated with any source IR; annotate for debug dumps. */
   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}
324
/* Field-by-field equality between two instructions, used by passes like CSE.
 * Compares opcode, operands and the message/control fields that affect the
 * generated instruction; fields not listed here are not considered.
 */
bool
fs_inst::equals(fs_inst *inst) const
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}
345
346 bool
347 fs_inst::overwrites_reg(const fs_reg &reg) const
348 {
349 return (reg.file == dst.file &&
350 reg.reg == dst.reg &&
351 reg.reg_offset >= dst.reg_offset &&
352 reg.reg_offset < dst.reg_offset + regs_written);
353 }
354
/* True for send-type instructions whose message payload lives in the GRF
 * rather than in MRFs.
 */
bool
fs_inst::is_send_from_grf() const
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF) ||
           (is_tex() && src[0].file == GRF));
}
364
365 bool
366 fs_visitor::can_do_source_mods(fs_inst *inst)
367 {
368 if (brw->gen == 6 && inst->is_math())
369 return false;
370
371 if (inst->is_send_from_grf())
372 return false;
373
374 if (!inst->can_do_source_mods())
375 return false;
376
377 return true;
378 }
379
/* Reset an fs_reg to its default state; stride defaults to 1 (contiguous). */
void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   stride = 1;
}
386
/** Generic unset register constructor: produces a BAD_FILE register. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}
393
/** Immediate value constructor (32-bit float). */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}
402
/** Immediate value constructor (signed 32-bit integer). */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}
411
/** Immediate value constructor (unsigned 32-bit integer). */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}
420
/** Fixed brw_reg: wraps a concrete hardware register; type follows it. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}
429
/* Structural equality between two registers.  Any register with a reladdr
 * (indirect addressing) is never considered equal to anything.  imm is a
 * union, so comparing imm.u covers all immediate representations.
 */
bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           subreg_offset == r.subreg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           stride == r.stride &&
           imm.u == r.imm.u);
}
446
/* Multiply this register's stride by the given factor.  Only valid on
 * virtual registers (not HW_REG/IMM), and the resulting stride must stay
 * a power of two no greater than 4 (or 0).  Returns *this for chaining.
 */
fs_reg &
fs_reg::apply_stride(unsigned stride)
{
   assert((this->stride * stride) <= 4 &&
          (is_power_of_two(stride) || stride == 0) &&
          file != HW_REG && file != IMM);
   this->stride *= stride;
   return *this;
}
456
/* Make every channel read the single component 'subreg': sets the byte
 * subreg_offset for that component and a stride of 0 (replicate).  Only
 * valid on virtual registers.  Returns *this for chaining.
 */
fs_reg &
fs_reg::set_smear(unsigned subreg)
{
   assert(file != HW_REG && file != IMM);
   subreg_offset = subreg * type_sz(type);
   stride = 0;
   return *this;
}
465
/* Whether consecutive channels occupy consecutive locations (stride 1). */
bool
fs_reg::is_contiguous() const
{
   return stride == 1;
}
471
472 bool
473 fs_reg::is_zero() const
474 {
475 if (file != IMM)
476 return false;
477
478 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
479 }
480
481 bool
482 fs_reg::is_one() const
483 {
484 if (file != IMM)
485 return false;
486
487 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
488 }
489
490 bool
491 fs_reg::is_null() const
492 {
493 return file == HW_REG &&
494 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
495 fixed_hw_reg.nr == BRW_ARF_NULL;
496 }
497
/* Whether this register may be used as a source of a 3-source instruction
 * (only GRF and UNIFORM files qualify).
 */
bool
fs_reg::is_valid_3src() const
{
   return file == GRF || file == UNIFORM;
}
503
/* Returns the number of scalar components ("slots") a GLSL type occupies
 * in our virtual register allocation.
 */
int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      /* Scalars and vectors: one slot per component. */
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      /* Sum of all member sizes. */
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_ATOMIC_UINT:
      /* Atomic counters live in buffer objects, not registers. */
      return 0;
   case GLSL_TYPE_IMAGE:
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}
540
/* Read the GPU timestamp ARF into a fresh virtual register and return a
 * register smeared to its low 32 bits.  Gen7+ only.
 */
fs_reg
fs_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.set_smear(0);

   return dst;
}
574
/* Snapshot the timestamp at shader entry; emit_shader_time_end() later
 * subtracts it to compute elapsed time.
 */
void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}
581
/* Emit the end-of-shader timing code: read the timestamp again, verify no
 * timestamp reset happened in between, and accumulate either the elapsed
 * time (plus a "written" count) or a "reset" count into the shader-time
 * buffer.
 */
void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   /* Pick the per-dispatch-width record types so SIMD8 and SIMD16 runs are
    * accumulated separately.
    */
   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.set_smear(2);
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   /* diff = end - start, computed as end + (-start). */
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}
630
/* Emit a SHADER_TIME_ADD that atomically accumulates 'value' into the
 * shader-time buffer slot for the given record type.
 */
void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
   /* Byte offset of this record within the shader-time buffer. */
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   /* Scratch payload register; SIMD8 needs two registers' worth.
    * NOTE(review): payload sizing presumably matches what the generator
    * expects for SHADER_TIME_ADD -- confirm against the opcode's lowering.
    */
   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                             fs_reg(), payload, offset, value));
}
648
/* Mark the compile as failed and record a formatted message.  Only the
 * first failure is recorded; subsequent calls are no-ops.
 */
void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   /* Echo to stderr when WM debugging is enabled. */
   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}
668
669 void
670 fs_visitor::fail(const char *format, ...)
671 {
672 va_list va;
673
674 va_start(va, format);
675 vfail(format, va);
676 va_end(va);
677 }
678
/**
 * Mark this program as impossible to compile in SIMD16 mode.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16 mode, so the compiler can skip
 * the SIMD16 compile altogether.
 *
 * During a SIMD16 compile (if one happens anyway), this just calls fail().
 */
void
fs_visitor::no16(const char *format, ...)
{
   va_list va;

   va_start(va, format);

   if (dispatch_width == 16) {
      vfail(format, va);
   } else {
      simd16_unsupported = true;

      /* Collect the reason(s) for skipping SIMD16 so perf_debug can report
       * them; append if several features are flagged.
       */
      if (brw->perf_debug) {
         if (no16_msg)
            ralloc_vasprintf_append(&no16_msg, format, va);
         else
            no16_msg = ralloc_vasprintf(mem_ctx, format, va);
      }
   }

   va_end(va);
}
710
/* Allocate and emit an instruction with no operands. */
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) fs_inst(opcode));
}
716
/* Allocate and emit an instruction with only a destination. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst));
}
722
/* Allocate and emit a one-source instruction. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
}
728
/* Allocate and emit a two-source instruction. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
}
734
/* Allocate and emit a three-source instruction. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
}
741
/* Enter a region where all emitted instructions are forced uncompressed
 * (SIMD8).  Nestable; balanced by pop_force_uncompressed().
 */
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}
747
/* Leave a force-uncompressed region opened by push_force_uncompressed(). */
void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   /* Catch unbalanced push/pop pairs. */
   assert(force_uncompressed_stack >= 0);
}
754
/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write() const
{
   /* Predicated writes (other than SEL, which always writes every channel),
    * half-width writes, and strided destinations all leave some of the
    * destination untouched.
    */
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           this->force_uncompressed ||
           this->force_sechalf || !this->dst.is_contiguous());
}
770
771 int
772 fs_inst::regs_read(fs_visitor *v, int arg) const
773 {
774 if (is_tex() && arg == 0 && src[0].file == GRF) {
775 if (v->dispatch_width == 16)
776 return (mlen + 1) / 2;
777 else
778 return mlen;
779 }
780 return 1;
781 }
782
/* Whether this instruction reads the flag register (i.e. is predicated). */
bool
fs_inst::reads_flag() const
{
   return predicate;
}
788
789 bool
790 fs_inst::writes_flag() const
791 {
792 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
793 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
794 }
795
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   /* No message means no MRF payload at all. */
   if (inst->mlen == 0)
      return 0;

   /* base_mrf == -1 means the payload is in the GRF (send-from-GRF). */
   if (inst->base_mrf == -1)
      return 0;

   switch (inst->opcode) {
   /* Unary math: one payload register per SIMD8 slice. */
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   /* Binary math: two payload registers per SIMD8 slice. */
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 2;
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
      return 0;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
853
/* Allocate a new virtual GRF of 'size' registers, growing the size table as
 * needed (doubling strategy).  Returns the new virtual register number.
 */
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}
868
/** File + register-number constructor; type defaults to float. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}
877
/** File + register-number constructor with an explicit register type. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}
886
/** Automatic reg constructor: allocates a fresh virtual GRF sized for the
 * given GLSL type, with the matching hardware register type.
 */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}
897
/* Look up the fs_reg previously assigned to an IR variable, or NULL if the
 * variable has no storage yet.
 */
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}
903
/* hash_table_call_foreach callback: copy a (variable -> fs_reg) entry into
 * the destination table, but only for UNIFORM registers.
 */
void
import_uniforms_callback(const void *key,
			 void *data,
			 void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}
917
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 * This brings in those uniform definitions, along with the push/pull
 * constant layout decisions already made by the SIMD8 compile.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
			   import_uniforms_callback,
			   variable_ht);
   this->push_constant_loc = v->push_constant_loc;
   this->pull_constant_loc = v->pull_constant_loc;
   this->uniforms = v->uniforms;
   this->param_size = v->param_size;
}
932
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 *
 * Appends a stage_prog_data->param pointer for every scalar slot of the
 * given uniform variable, in declaration order.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = uniforms;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      /* Match "name", "name.field" or "name[i]" but not e.g. "nameother". */
      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         stage_prog_data->param[uniforms++] = &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() == uniforms);
   (void)params_before;
}
973
974
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					    (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
	 int swiz = GET_SWZ(slots[i].swizzle, j);
	 /* A repeated swizzle marks the end of the unique components. */
	 if (swiz == last_swiz)
	    break;
	 last_swiz = swiz;

	 stage_prog_data->param[uniforms++] =
	    &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}
1008
/* Emit the code that computes gl_FragCoord (x, y, z, w) into a fresh
 * register, handling the pixel-center-integer and origin-upper-left layout
 * qualifiers and window-system vs. FBO Y orientation.
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   /* Y must be flipped when the shader's origin convention disagrees with
    * the render target's.
    */
   bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->data.pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      /* Default convention: pixel centers are at half-integer positions. */
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->data.pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 /* y' = (height - 1) - y, folded into a negate plus offset. */
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (brw->gen >= 6) {
      /* Gen6+ delivers source depth in the thread payload. */
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      /* Older gens interpolate Z from the position setup data. */
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
1056
/* Emit a LINTERP of the given setup-data register into 'attr', choosing the
 * barycentric coordinate set that matches the interpolation qualifier and
 * centroid/sample mode.  Returns the emitted instruction.
 */
fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid, bool is_sample)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (brw->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else if (is_sample) {
          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
          else
             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}
1091
/* Emit interpolation for an arbitrary fragment shader input variable
 * (scalar, vector, matrix or array thereof), walking its components slot by
 * slot.  Returns the register holding the interpolated value.
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->data.location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (c->prog_data.urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
	    /* Constant interpolation (flat shading) case.  The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
	       interp.type = reg->type;
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Smooth/noperspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       emit_linterp(attr, fs_reg(interp), interpolation_mode,
			    ir->data.centroid && !c->key.persample_shading,
			    ir->data.sample || c->key.persample_shading);
               if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode,
                                               false, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
	       if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
		  /* Pre-gen6 delivers attributes divided by W; undo that. */
		  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
	       }
	       attr.reg_offset++;
	    }

	 }
	 location++;
      }
   }

   return reg;
}
1173
/**
 * Set up a register holding the gl_FrontFacing built-in (1 for front-facing,
 * 0 for back-facing, per channel).
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (brw->gen >= 6) {
      /* Arithmetic-shift bit 15 of g0.0 (the back-facing bit per the
       * payload layout) into the low bit, then invert and mask so the
       * result is 1 exactly when that bit was 0.
       */
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
1197
1198 void
1199 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1200 {
1201 assert(dst.type == BRW_REGISTER_TYPE_F);
1202
1203 if (c->key.compute_pos_offset) {
1204 /* Convert int_sample_pos to floating point */
1205 emit(MOV(dst, int_sample_pos));
1206 /* Scale to the range [0, 1] */
1207 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1208 }
1209 else {
1210 /* From ARB_sample_shading specification:
1211 * "When rendering to a non-multisample buffer, or if multisample
1212 * rasterization is disabled, gl_SamplePosition will always be
1213 * (0.5, 0.5).
1214 */
1215 emit(MOV(dst, fs_reg(0.5f)));
1216 }
1217 }
1218
/**
 * Set up the gl_SamplePosition vec2 built-in.
 *
 * Reads the per-sample X/Y byte offsets out of the thread payload and hands
 * them to compute_sample_position() for conversion to float.
 */
fs_reg *
fs_visitor::emit_samplepos_setup(ir_variable *ir)
{
   assert(brw->gen >= 6);
   assert(ir->type == glsl_type::vec2_type);

   this->current_annotation = "compute sample position";
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg pos = *reg;
   fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
   fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);

   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
    * mode will be enabled.
    *
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
    * R31.1:0         Position Offset X/Y for Slot[3:0]
    * R31.3:2         Position Offset X/Y for Slot[7:4]
    * .....
    *
    * The X, Y sample positions come in as bytes in thread payload. So, read
    * the positions using vstride=16, width=8, hstride=2.
    */
   struct brw_reg sample_pos_reg =
      stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
                    BRW_REGISTER_TYPE_B), 16, 8, 2);

   emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
   if (dispatch_width == 16) {
      /* The second 8 channels of a SIMD16 thread read the next 16 payload
       * bytes; force the MOV to execute on the second half.
       */
      fs_inst *inst = emit(MOV(half(int_sample_x, 1),
                               fs_reg(suboffset(sample_pos_reg, 16))));
      inst->force_sechalf = true;
   }
   /* Compute gl_SamplePosition.x */
   compute_sample_position(pos, int_sample_x);
   pos.reg_offset++;
   /* The Y bytes are interleaved immediately after the X bytes, hence the
    * suboffset of 1.
    */
   emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
   if (dispatch_width == 16) {
      fs_inst *inst = emit(MOV(half(int_sample_y, 1),
                               fs_reg(suboffset(sample_pos_reg, 17))));
      inst->force_sechalf = true;
   }
   /* Compute gl_SamplePosition.y */
   compute_sample_position(pos, int_sample_y);
   return reg;
}
1265
/**
 * Set up the gl_SampleID built-in.
 *
 * When the key requests per-sample dispatch (compute_sample_id), derive the
 * ID from the R0.0 payload bits; otherwise it is constant zero as required
 * by ARB_sample_shading.
 */
fs_reg *
fs_visitor::emit_sampleid_setup(ir_variable *ir)
{
   assert(brw->gen >= 6);

   this->current_annotation = "compute sample id";
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   if (c->key.compute_sample_id) {
      fs_reg t1 = fs_reg(this, glsl_type::int_type);
      fs_reg t2 = fs_reg(this, glsl_type::int_type);
      t2.type = BRW_REGISTER_TYPE_UW;

      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
       * 8x multisampling, subspan 0 will represent sample N (where N
       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
       * 7. We can find the value of N by looking at R0.0 bits 7:6
       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
       * (since samples are always delivered in pairs). That is, we
       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
       * populating a temporary variable with the sequence (0, 1, 2, 3),
       * and then reading from it using vstride=1, width=4, hstride=0.
       * These computations hold good for 4x multisampling as well.
       */
      emit(BRW_OPCODE_AND, t1,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(brw_imm_d(0xc0)));
      emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
      /* This works for both SIMD8 and SIMD16 */
      emit(MOV(t2, brw_imm_v(0x3210)));
      /* This special instruction takes care of setting vstride=1,
       * width=4, hstride=0 of t2 during an ADD instruction.
       */
      emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
   } else {
      /* As per GL_ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SampleID will always be zero."
       */
      emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
   }

   return reg;
}
1313
/**
 * Set up gl_SampleMaskIn by copying it out of the payload register recorded
 * in c->sample_mask_reg (gen7+ only).
 */
fs_reg *
fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
{
   assert(brw->gen >= 7);
   this->current_annotation = "compute gl_SampleMaskIn";
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
   return reg;
}
1323
1324 fs_reg
1325 fs_visitor::fix_math_operand(fs_reg src)
1326 {
1327 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1328 * might be able to do better by doing execsize = 1 math and then
1329 * expanding that result out, but we would need to be careful with
1330 * masking.
1331 *
1332 * The hardware ignores source modifiers (negate and abs) on math
1333 * instructions, so we also move to a temp to set those up.
1334 */
1335 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1336 !src.abs && !src.negate)
1337 return src;
1338
1339 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1340 * operands to math
1341 */
1342 if (brw->gen >= 7 && src.file != IMM)
1343 return src;
1344
1345 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1346 expanded.type = src.type;
1347 emit(BRW_OPCODE_MOV, expanded, src);
1348 return expanded;
1349 }
1350
/**
 * Emit a single-source math instruction (RCP, RSQ, SQRT, EXP2, LOG2, SIN,
 * COS).
 *
 * Applies the gen6+ operand restrictions via fix_math_operand(), and sets
 * up the MRF message registers (base_mrf/mlen) that pre-gen6 math needs.
 * Returns the emitted instruction, or NULL on an unsupported opcode.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (brw->gen < 6) {
      /* One message register per 8 dispatched channels. */
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}
1388
/**
 * Emit a two-source math instruction (POW, INT_QUOTIENT, INT_REMAINDER).
 *
 * On gen6+ both operands are fixed up and the instruction is emitted
 * directly; pre-gen6 the second operand is staged through an MRF message
 * register.  Returns the emitted instruction, or NULL for an unsupported
 * opcode.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Gen7+ can't run INTDIV in SIMD16; flag the compile via no16(). */
      if (brw->gen >= 7)
         no16("SIMD16 INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (brw->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      /* Stage the second operand in the MRF; the send reads both message
       * registers starting at base_mrf.
       */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
1435
/**
 * Map UNIFORM-file sources to the fixed hardware registers the push
 * constants (CURB) were uploaded into.
 *
 * Also records the first CURB GRF and the CURB read length for this
 * dispatch width in prog_data.
 */
void
fs_visitor::assign_curb_setup()
{
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Push constants are read in units of 8 (one register's worth). */
   c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
            int constant_nr;
            if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = push_constant_loc[uniform_nr];
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                *  values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            /* Eight push constants fit per GRF, starting right after the
             * fixed payload registers.
             */
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].subreg_offset);
         }
      }
   }
}
1478
/**
 * Decide which URB setup slot delivers each fragment shader input varying.
 *
 * Fills c->prog_data.urb_setup[] (-1 meaning "not delivered") and
 * c->prog_data.num_varying_inputs.
 */
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      c->prog_data.urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (brw->gen >= 6) {
      if (_mesa_bitcount_64(fp->Base.InputsRead &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               c->prog_data.urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(brw, &prev_stage_vue_map,
                             c->key.input_slots_valid);
         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
             * unused.
             */
            if (varying != BRW_VARYING_SLOT_COUNT &&
                (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               c->prog_data.urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               c->prog_data.urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   c->prog_data.num_varying_inputs = urb_next;
}
1563
1564 void
1565 fs_visitor::assign_urb_setup()
1566 {
1567 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1568
1569 /* Offset all the urb_setup[] index by the actual position of the
1570 * setup regs, now that the location of the constants has been chosen.
1571 */
1572 foreach_list(node, &this->instructions) {
1573 fs_inst *inst = (fs_inst *)node;
1574
1575 if (inst->opcode == FS_OPCODE_LINTERP) {
1576 assert(inst->src[2].file == HW_REG);
1577 inst->src[2].fixed_hw_reg.nr += urb_start;
1578 }
1579
1580 if (inst->opcode == FS_OPCODE_CINTERP) {
1581 assert(inst->src[0].file == HW_REG);
1582 inst->src[0].fixed_hw_reg.nr += urb_start;
1583 }
1584 }
1585
1586 /* Each attribute is 4 setup channels, each of which is half a reg. */
1587 this->first_non_payload_grf =
1588 urb_start + c->prog_data.num_varying_inputs * 2;
1589 }
1590
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];       /* whether each VGRF may be split */
   int new_virtual_grf[num_vars];  /* first new VGRF for components 1.. */

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].file == GRF) {
               split_grf[inst->src[i].reg] = false;
            }
         }
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         /* Component 0 stays in the original VGRF (shrunk to size 1 below);
          * components 1..size-1 each get a fresh single-register VGRF.  The
          * first allocation seeds new_virtual_grf[i], and the loop from j=2
          * allocates the remaining size-2, asserting they are contiguous.
          */
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Rewrite reg_offset != 0 accesses to point at the replacement
       * single-register VGRFs; offset 0 keeps the original register.
       */
      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   invalidate_live_intervals();
}
1694
1695 /**
1696 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1697 *
1698 * During code generation, we create tons of temporary variables, many of
1699 * which get immediately killed and are never used again. Yet, in later
1700 * optimization and analysis passes, such as compute_live_intervals, we need
1701 * to loop over all the virtual GRFs. Compacting them can save a lot of
1702 * overhead.
1703 */
1704 void
1705 fs_visitor::compact_virtual_grfs()
1706 {
1707 /* Mark which virtual GRFs are used, and count how many. */
1708 int remap_table[this->virtual_grf_count];
1709 memset(remap_table, -1, sizeof(remap_table));
1710
1711 foreach_list(node, &this->instructions) {
1712 const fs_inst *inst = (const fs_inst *) node;
1713
1714 if (inst->dst.file == GRF)
1715 remap_table[inst->dst.reg] = 0;
1716
1717 for (int i = 0; i < 3; i++) {
1718 if (inst->src[i].file == GRF)
1719 remap_table[inst->src[i].reg] = 0;
1720 }
1721 }
1722
1723 /* In addition to registers used in instructions, fs_visitor keeps
1724 * direct references to certain special values which must be patched:
1725 */
1726 struct {
1727 fs_reg *reg;
1728 unsigned count;
1729 } special[] = {
1730 { &frag_depth, 1 },
1731 { &pixel_x, 1 },
1732 { &pixel_y, 1 },
1733 { &pixel_w, 1 },
1734 { &wpos_w, 1 },
1735 { &dual_src_output, 1 },
1736 { outputs, ARRAY_SIZE(outputs) },
1737 { delta_x, ARRAY_SIZE(delta_x) },
1738 { delta_y, ARRAY_SIZE(delta_y) },
1739 { &sample_mask, 1 },
1740 { &shader_start_time, 1 },
1741 };
1742
1743 /* Treat all special values as used, to be conservative */
1744 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1745 for (unsigned j = 0; j < special[i].count; j++) {
1746 if (special[i].reg[j].file == GRF)
1747 remap_table[special[i].reg[j].reg] = 0;
1748 }
1749 }
1750
1751 /* Compact the GRF arrays. */
1752 int new_index = 0;
1753 for (int i = 0; i < this->virtual_grf_count; i++) {
1754 if (remap_table[i] != -1) {
1755 remap_table[i] = new_index;
1756 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1757 invalidate_live_intervals();
1758 ++new_index;
1759 }
1760 }
1761
1762 this->virtual_grf_count = new_index;
1763
1764 /* Patch all the instructions to use the newly renumbered registers */
1765 foreach_list(node, &this->instructions) {
1766 fs_inst *inst = (fs_inst *) node;
1767
1768 if (inst->dst.file == GRF)
1769 inst->dst.reg = remap_table[inst->dst.reg];
1770
1771 for (int i = 0; i < 3; i++) {
1772 if (inst->src[i].file == GRF)
1773 inst->src[i].reg = remap_table[inst->src[i].reg];
1774 }
1775 }
1776
1777 /* Patch all the references to special values */
1778 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1779 for (unsigned j = 0; j < special[i].count; j++) {
1780 fs_reg *reg = &special[i].reg[j];
1781 if (reg->file == GRF && remap_table[reg->reg] != -1)
1782 reg->reg = remap_table[reg->reg];
1783 }
1784 }
1785 }
1786
/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   /* Only the first compile (SIMD8 mode) decides the constant layout;
    * see assign_constant_locations().
    */
   if (dispatch_width != 8)
      return;

   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);

   /* -1 means "not (yet) in the pull constant buffer". */
   for (unsigned int i = 0; i < uniforms; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         /* reladdr marks an indirectly-addressed (array) uniform access. */
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &stage_prog_data->param[uniform];

            assert(param_size[uniform]);

            /* Copy the whole array, element by element, into pull_param. */
            for (int j = 0; j < param_size[uniform]; j++) {
               pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;

               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }
}
1844
/**
 * Assign UNIFORM file registers to either push constants or pull constants.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::assign_constant_locations()
{
   /* Only the first compile (SIMD8 mode) gets to decide on locations. */
   if (dispatch_width != 8)
      return;

   /* Find which UNIFORM registers are still in use. */
   bool is_live[uniforms];
   for (unsigned int i = 0; i < uniforms; i++) {
      is_live[i] = false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         /* Out-of-range reads are possible (see assign_curb_setup); only
          * mark in-range uniforms live.
          */
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
         if (constant_nr >= 0 && constant_nr < (int) uniforms)
            is_live[constant_nr] = true;
      }
   }

   /* Only allow 16 registers (128 uniform components) as push constants.
    *
    * Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int max_push_components = 16 * 8;
   unsigned int num_push_constants = 0;

   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);

   for (unsigned int i = 0; i < uniforms; i++) {
      if (!is_live[i] || pull_constant_loc[i] != -1) {
         /* This UNIFORM register is either dead, or has already been demoted
          * to a pull const.  Mark it as no longer living in the param[] array.
          */
         push_constant_loc[i] = -1;
         continue;
      }

      if (num_push_constants < max_push_components) {
         /* Retain as a push constant.  Record the location in the params[]
          * array.
          */
         push_constant_loc[i] = num_push_constants++;
      } else {
         /* Demote to a pull constant. */
         push_constant_loc[i] = -1;

         int pull_index = stage_prog_data->nr_pull_params++;
         stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
         pull_constant_loc[i] = pull_index;
      }
   }

   stage_prog_data->nr_params = num_push_constants;

   /* Up until now, the param[] array has been indexed by reg + reg_offset
    * of UNIFORM registers.  Condense it to only contain the uniforms we
    * chose to upload as push constants.
    */
   for (unsigned int i = 0; i < uniforms; i++) {
      int remapped = push_constant_loc[i];

      if (remapped == -1)
         continue;

      /* Entries only move down, so in-place compaction is safe. */
      assert(remapped <= (int)i);
      stage_prog_data->param[remapped] = stage_prog_data->param[i];
   }
}
1930
/**
 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
 */
void
fs_visitor::demote_pull_constants()
{
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         /* Skip uniforms that were chosen as push constants (-1). */
         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
         fs_reg dst = fs_reg(this, glsl_type::float_type);

         /* Generate a pull load into dst. */
         if (inst->src[i].reladdr) {
            /* Indirect (array) access: the offset is computed at runtime. */
            exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
                                                        surf_index,
                                                        *inst->src[i].reladdr,
                                                        pull_index);
            inst->insert_before(&list);
            inst->src[i].reladdr = NULL;
         } else {
            /* Direct access: load the 16-byte-aligned vec4 containing the
             * constant, then smear the wanted component across the reg.
             */
            fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
            fs_inst *pull =
               new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                    dst, surf_index, offset);
            inst->insert_before(pull);
            inst->src[i].set_smear(pull_index & 3);
         }

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
      }
   }
   invalidate_live_intervals();
}
1982
1983 bool
1984 fs_visitor::opt_algebraic()
1985 {
1986 bool progress = false;
1987
1988 foreach_list(node, &this->instructions) {
1989 fs_inst *inst = (fs_inst *)node;
1990
1991 switch (inst->opcode) {
1992 case BRW_OPCODE_MUL:
1993 if (inst->src[1].file != IMM)
1994 continue;
1995
1996 /* a * 1.0 = a */
1997 if (inst->src[1].is_one()) {
1998 inst->opcode = BRW_OPCODE_MOV;
1999 inst->src[1] = reg_undef;
2000 progress = true;
2001 break;
2002 }
2003
2004 /* a * 0.0 = 0.0 */
2005 if (inst->src[1].is_zero()) {
2006 inst->opcode = BRW_OPCODE_MOV;
2007 inst->src[0] = inst->src[1];
2008 inst->src[1] = reg_undef;
2009 progress = true;
2010 break;
2011 }
2012
2013 break;
2014 case BRW_OPCODE_ADD:
2015 if (inst->src[1].file != IMM)
2016 continue;
2017
2018 /* a + 0.0 = a */
2019 if (inst->src[1].is_zero()) {
2020 inst->opcode = BRW_OPCODE_MOV;
2021 inst->src[1] = reg_undef;
2022 progress = true;
2023 break;
2024 }
2025 break;
2026 case BRW_OPCODE_OR:
2027 if (inst->src[0].equals(inst->src[1])) {
2028 inst->opcode = BRW_OPCODE_MOV;
2029 inst->src[1] = reg_undef;
2030 progress = true;
2031 break;
2032 }
2033 break;
2034 case BRW_OPCODE_LRP:
2035 if (inst->src[1].equals(inst->src[2])) {
2036 inst->opcode = BRW_OPCODE_MOV;
2037 inst->src[0] = inst->src[1];
2038 inst->src[1] = reg_undef;
2039 inst->src[2] = reg_undef;
2040 progress = true;
2041 break;
2042 }
2043 break;
2044 case BRW_OPCODE_SEL:
2045 if (inst->saturate && inst->src[1].file == IMM) {
2046 switch (inst->conditional_mod) {
2047 case BRW_CONDITIONAL_LE:
2048 case BRW_CONDITIONAL_L:
2049 switch (inst->src[1].type) {
2050 case BRW_REGISTER_TYPE_F:
2051 if (inst->src[1].imm.f >= 1.0f) {
2052 inst->opcode = BRW_OPCODE_MOV;
2053 inst->src[1] = reg_undef;
2054 progress = true;
2055 }
2056 break;
2057 default:
2058 break;
2059 }
2060 break;
2061 case BRW_CONDITIONAL_GE:
2062 case BRW_CONDITIONAL_G:
2063 switch (inst->src[1].type) {
2064 case BRW_REGISTER_TYPE_F:
2065 if (inst->src[1].imm.f <= 0.0f) {
2066 inst->opcode = BRW_OPCODE_MOV;
2067 inst->src[1] = reg_undef;
2068 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2069 progress = true;
2070 }
2071 break;
2072 default:
2073 break;
2074 }
2075 default:
2076 break;
2077 }
2078 }
2079 break;
2080 default:
2081 break;
2082 }
2083 }
2084
2085 return progress;
2086 }
2087
/**
 * Removes any instructions writing a VGRF where that VGRF is not used by any
 * later instruction.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;   /* instruction index, matching the live-interval IPs */

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && !inst->has_side_effects()) {
         bool dead = true;

         /* The write is dead only if every register it writes has its live
          * interval ending exactly at this instruction (i.e. nothing reads
          * it later).
          */
         for (int i = 0; i < inst->regs_written; i++) {
            int var = live_intervals->var_from_vgrf[inst->dst.reg];
            assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
            if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
               dead = false;
               break;
            }
         }

         if (dead) {
            /* Don't dead code eliminate instructions that write to the
             * accumulator as a side-effect. Instead just set the destination
             * to the null register to free it.
             */
            switch (inst->opcode) {
            case BRW_OPCODE_ADDC:
            case BRW_OPCODE_SUBB:
            case BRW_OPCODE_MACH:
               inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
               break;
            default:
               inst->remove();
               progress = true;
               break;
            }
         }
      }

      pc++;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
2142
/* Key identifying a single (virtual GRF, register offset) write for the
 * local dead-code hash table below.
 */
struct dead_code_hash_key
{
   int vgrf;
   int reg_offset;
};
2148
/* Hash table key-equality callback: two keys match when all their fields
 * (vgrf and reg_offset) compare equal bytewise.
 */
static bool
dead_code_hash_compare(const void *a, const void *b)
{
   return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
}
2154
/* Remove every entry from the table.  Used at basic-block boundaries, where
 * the pass's dataflow assumptions no longer hold.
 */
static void
clear_dead_code_hash(struct hash_table *ht)
{
   struct hash_entry *entry;

   hash_table_foreach(ht, entry) {
      _mesa_hash_table_remove(ht, entry);
   }
}
2164
/* Record inst as the as-yet-unread writer of (vgrf, reg_offset). */
static void
insert_dead_code_hash(struct hash_table *ht,
                      int vgrf, int reg_offset, fs_inst *inst)
{
   /* We don't bother freeing keys, because they'll be GCed with the ht. */
   struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);

   key->vgrf = vgrf;
   key->reg_offset = reg_offset;

   _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
}
2177
/* Look up the pending-write entry for (vgrf, reg_offset); returns NULL if
 * there is none.
 */
static struct hash_entry *
get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
{
   struct dead_code_hash_key key;

   key.vgrf = vgrf;
   key.reg_offset = reg_offset;

   return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
}
2188
2189 static void
2190 remove_dead_code_hash(struct hash_table *ht,
2191 int vgrf, int reg_offset)
2192 {
2193 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2194 if (!entry)
2195 return;
2196
2197 _mesa_hash_table_remove(ht, entry);
2198 }
2199
2200 /**
2201 * Walks basic blocks, removing any regs that are written but not read before
2202 * being redefined.
2203 *
2204 * The dead_code_eliminate() function implements a global dead code
2205 * elimination, but it only handles the removing the last write to a register
2206 * if it's never read. This one can handle intermediate writes, but only
2207 * within a basic block.
2208 */
2209 bool
2210 fs_visitor::dead_code_eliminate_local()
2211 {
2212 struct hash_table *ht;
2213 bool progress = false;
2214
2215 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2216
2217 if (ht == NULL) {
2218 return false;
2219 }
2220
2221 foreach_list_safe(node, &this->instructions) {
2222 fs_inst *inst = (fs_inst *)node;
2223
2224 /* At a basic block, empty the HT since we don't understand dataflow
2225 * here.
2226 */
2227 if (inst->is_control_flow()) {
2228 clear_dead_code_hash(ht);
2229 continue;
2230 }
2231
2232 /* Clear the HT of any instructions that got read. */
2233 for (int i = 0; i < 3; i++) {
2234 fs_reg src = inst->src[i];
2235 if (src.file != GRF)
2236 continue;
2237
2238 int read = 1;
2239 if (inst->is_send_from_grf())
2240 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2241
2242 for (int reg_offset = src.reg_offset;
2243 reg_offset < src.reg_offset + read;
2244 reg_offset++) {
2245 remove_dead_code_hash(ht, src.reg, reg_offset);
2246 }
2247 }
2248
2249 /* Add any update of a GRF to the HT, removing a previous write if it
2250 * wasn't read.
2251 */
2252 if (inst->dst.file == GRF) {
2253 if (inst->regs_written > 1) {
2254 /* We don't know how to trim channels from an instruction's
2255 * writes, so we can't incrementally remove unread channels from
2256 * it. Just remove whatever it overwrites from the table
2257 */
2258 for (int i = 0; i < inst->regs_written; i++) {
2259 remove_dead_code_hash(ht,
2260 inst->dst.reg,
2261 inst->dst.reg_offset + i);
2262 }
2263 } else {
2264 struct hash_entry *entry =
2265 get_dead_code_hash_entry(ht, inst->dst.reg,
2266 inst->dst.reg_offset);
2267
2268 if (entry) {
2269 if (inst->is_partial_write()) {
2270 /* For a partial write, we can't remove any previous dead code
2271 * candidate, since we're just modifying their result.
2272 */
2273 } else {
2274 /* We're completely updating a channel, and there was a
2275 * previous write to the channel that wasn't read. Kill it!
2276 */
2277 fs_inst *inst = (fs_inst *)entry->data;
2278 inst->remove();
2279 progress = true;
2280 }
2281
2282 _mesa_hash_table_remove(ht, entry);
2283 }
2284
2285 if (!inst->has_side_effects())
2286 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2287 inst);
2288 }
2289 }
2290 }
2291
2292 _mesa_hash_table_destroy(ht, NULL);
2293
2294 if (progress)
2295 invalidate_live_intervals();
2296
2297 return progress;
2298 }
2299
/**
 * Rewrites the instruction that computed a GRF temporary so it writes
 * directly into the MRF a following MOV copied it to, then deletes the MOV.
 *
 * Returns true (and invalidates live intervals) on any change.
 */
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      int ip = next_ip;
      next_ip++;

      /* Only consider a full, unmodified, same-type GRF -> MRF MOV; any
       * source modifier, type conversion, or non-contiguous source would
       * change the value if folded into the producing instruction.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate ||
          !inst->src[0].is_contiguous() ||
          inst->src[0].subreg_offset)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         /* COMPR4 writes touch both m and m+4. */
         mrf_high = mrf_low + 4;
      } else if (dispatch_width == 16 &&
                 (!inst->force_uncompressed && !inst->force_sechalf)) {
         /* A full SIMD16 write covers two adjacent MRFs. */
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_end[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      fs_inst *scan_inst;
      for (scan_inst = (fs_inst *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (fs_inst *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* If this one instruction didn't populate all the
             * channels, bail.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->is_partial_write())
               break;

            /* Things returning more than one register would need us to
             * understand coalescing out more than one MOV at a time.
             */
            if (scan_inst->regs_written > 1)
               break;

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (brw->gen == 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value.  Retarget
                * its destination and fold in the MOV's saturate.
                */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove();
               progress = true;
            }
            break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (dispatch_width == 16 &&
                       (!scan_inst->force_uncompressed &&
                        !scan_inst->force_sechalf)) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
2463
/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 *
 * Tracking is reset at control flow (we only reason within a basic block)
 * and skipped entirely in SIMD16, where compressed instructions write MRF
 * pairs that this simple one-slot-per-MRF table doesn't model.
 *
 * Returns true (and invalidates live intervals) if any MOV was removed.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   /* Most recent full GRF -> MRF MOV per MRF, or NULL if unknown. */
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->is_control_flow()) {
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
      }

      /* If this MOV writes exactly what the last recorded MOV to the same
       * MRF wrote, it's redundant -- drop it.
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      /* Record a full, unmodified GRF -> MRF MOV as the MRF's last known
       * contents.
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->is_partial_write()) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
2534
2535 static void
2536 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2537 int first_grf, int grf_len)
2538 {
2539 bool inst_simd16 = (dispatch_width > 8 &&
2540 !inst->force_uncompressed &&
2541 !inst->force_sechalf);
2542
2543 /* Clear the flag for registers that actually got read (as expected). */
2544 for (int i = 0; i < 3; i++) {
2545 int grf;
2546 if (inst->src[i].file == GRF) {
2547 grf = inst->src[i].reg;
2548 } else if (inst->src[i].file == HW_REG &&
2549 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2550 grf = inst->src[i].fixed_hw_reg.nr;
2551 } else {
2552 continue;
2553 }
2554
2555 if (grf >= first_grf &&
2556 grf < first_grf + grf_len) {
2557 deps[grf - first_grf] = false;
2558 if (inst_simd16)
2559 deps[grf - first_grf + 1] = false;
2560 }
2561 }
2562 }
2563
/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 *      check for post destination dependencies on this instruction, software
 *      must ensure that there is no destination hazard for the case of ‘write
 *      followed by a posted write’ shown in the following example.
 *
 *      1. mov r3 0
 *      2. send r3.xy <rest of send instruction>
 *      3. mov r2 r3
 *
 *      Due to no post-destination dependency check on the ‘send’, the above
 *      code sequence could have two instructions (1 and 2) in flight at the
 *      same time that both consider ‘r3’ as the target of their final writes.
 *
 * Resolved by inserting DEP_RESOLVE_MOV reads of the hazarded registers
 * immediately before @inst (a SEND writing a GRF destination).
 */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
{
   /* Post-register-allocation, registers are tracked in hardware-GRF
    * units: one per VGRF slot in SIMD8, two in SIMD16.
    */
   int reg_size = dispatch_width / 8;
   int write_len = inst->regs_written * reg_size;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   /* Start by assuming every register the SEND writes has an outstanding
    * prior write we must wait on.
    */
   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   clear_deps_for_inst_src(inst, dispatch_width,
                           needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->prev;
        scan_inst != NULL;
        scan_inst = (fs_inst *)scan_inst->prev) {

      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         return;
      }

      bool scan_inst_simd16 = (dispatch_width > 8 &&
                               !scan_inst->force_uncompressed &&
                               !scan_inst->force_sechalf);

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == GRF) {
         for (int i = 0; i < scan_inst->regs_written; i++) {
            int reg = scan_inst->dst.reg + i * reg_size;

            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               inst->insert_before(DEP_RESOLVE_MOV(reg));
               needs_dep[reg - first_write_grf] = false;
               /* A SIMD16 write covered reg+1 as well. */
               if (scan_inst_simd16)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}
2653
/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Errata: A destination register from a send can not be
 *      used as a destination register until after it has been sourced by an
 *      instruction with a different destination register.
 *
 * Resolved by inserting DEP_RESOLVE_MOV reads of the SEND's destination
 * registers before any later instruction that would overwrite them without
 * first reading them.
 */
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
{
   /* Size in hardware GRFs (one per VGRF slot in SIMD8, two in SIMD16). */
   int write_len = inst->regs_written * dispatch_width / 8;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->next;
        !scan_inst->is_tail_sentinel();
        scan_inst = (fs_inst *)scan_inst->next) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == GRF &&
          scan_inst->dst.reg >= first_write_grf &&
          scan_inst->dst.reg < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
         scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }

   /* If we hit the end of the program, resolve all remaining dependencies out
    * of paranoia.
    */
   fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
   assert(last_inst->eot);
   for (int i = 0; i < write_len; i++) {
      if (needs_dep[i])
         last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
   }
}
2721
2722 void
2723 fs_visitor::insert_gen4_send_dependency_workarounds()
2724 {
2725 if (brw->gen != 4 || brw->is_g4x)
2726 return;
2727
2728 /* Note that we're done with register allocation, so GRF fs_regs always
2729 * have a .reg_offset of 0.
2730 */
2731
2732 foreach_list_safe(node, &this->instructions) {
2733 fs_inst *inst = (fs_inst *)node;
2734
2735 if (inst->mlen != 0 && inst->dst.file == GRF) {
2736 insert_gen4_pre_send_dependency_workarounds(inst);
2737 insert_gen4_post_send_dependency_workarounds(inst);
2738 }
2739 }
2740 }
2741
/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
void
fs_visitor::lower_uniform_pull_constant_loads()
{
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      if (brw->gen >= 7) {
         /* The offset arg before was a vec4-aligned byte offset.  We need to
          * turn it into a dword offset.
          */
         fs_reg const_offset_reg = inst->src[1];
         assert(const_offset_reg.file == IMM &&
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
         const_offset_reg.imm.u /= 4;
         fs_reg payload = fs_reg(this, glsl_type::uint_type);

         /* This is actually going to be a MOV, but since only the first dword
          * is accessed, we have a special opcode to do just that one.  Note
          * that this needs to be an operation that will be considered a def
          * by live variable analysis, or register allocation will explode.
          */
         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
                                               payload, const_offset_reg);
         setup->force_writemask_all = true;

         /* Carry over the debug annotations from the lowered instruction. */
         setup->ir = inst->ir;
         setup->annotation = inst->annotation;
         inst->insert_before(setup);

         /* Similarly, this will only populate the first 4 channels of the
          * result register (since we only use smear values from 0-3), but we
          * don't tell the optimizer.
          */
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;

         /* A new VGRF was created, so previous liveness data is stale. */
         invalidate_live_intervals();
      } else {
         /* Before register allocation, we didn't tell the scheduler about the
          * MRF we use.  We know it's safe to use this MRF because nothing
          * else does except for register spill/unspill, which generates and
          * uses its MRF within a single IR instruction.
          */
         inst->base_mrf = 14;
         inst->mlen = 1;
      }
   }
}
2809
2810 void
2811 fs_visitor::dump_instructions()
2812 {
2813 calculate_register_pressure();
2814
2815 int ip = 0, max_pressure = 0;
2816 foreach_list(node, &this->instructions) {
2817 backend_instruction *inst = (backend_instruction *)node;
2818 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2819 fprintf(stderr, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2820 dump_instruction(inst);
2821 ++ip;
2822 }
2823 fprintf(stderr, "Maximum %3d registers live at once.\n", max_pressure);
2824 }
2825
/**
 * Prints a single instruction to stderr in the IR dump format:
 *
 *   (+f0.N) opcode.sat<cmod>.f0.N dst:type, src0:type, src1:type, src2:type
 *
 * followed by "1sthalf"/"2ndhalf" markers for half-execution instructions.
 */
void
fs_visitor::dump_instruction(backend_instruction *be_inst)
{
   fs_inst *inst = (fs_inst *)be_inst;

   /* Predicate: '+' for normal, '-' for inverted. */
   if (inst->predicate) {
      fprintf(stderr, "(%cf0.%d) ",
             inst->predicate_inverse ? '-' : '+',
             inst->flag_subreg);
   }

   fprintf(stderr, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(stderr, ".sat");
   if (inst->conditional_mod) {
      fprintf(stderr, "%s", conditional_modifier[inst->conditional_mod]);
      /* Only print the flag register when it isn't already implied by a
       * predicate and the opcode actually produces a flag result (on gen5+,
       * SEL/IF/WHILE consume a condition rather than write one).
       */
      if (!inst->predicate &&
          (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                            inst->opcode != BRW_OPCODE_IF &&
                            inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(stderr, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(stderr, " ");


   /* Destination operand. */
   switch (inst->dst.file) {
   case GRF:
      fprintf(stderr, "vgrf%d", inst->dst.reg);
      if (virtual_grf_sizes[inst->dst.reg] != 1 ||
          inst->dst.subreg_offset)
         fprintf(stderr, "+%d.%d",
                 inst->dst.reg_offset, inst->dst.subreg_offset);
      break;
   case MRF:
      fprintf(stderr, "m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      fprintf(stderr, "(null)");
      break;
   case UNIFORM:
      /* Uniforms are never legal destinations; flag them loudly. */
      fprintf(stderr, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
      break;
   case HW_REG:
      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         switch (inst->dst.fixed_hw_reg.nr) {
         case BRW_ARF_NULL:
            fprintf(stderr, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(stderr, "a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(stderr, "acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(stderr, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                            inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            fprintf(stderr, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                               inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         fprintf(stderr, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         fprintf(stderr, "+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   default:
      fprintf(stderr, "???");
      break;
   }
   fprintf(stderr, ":%s, ", brw_reg_type_letters(inst->dst.type));

   /* Source operands, stopping at the first BAD_FILE slot. */
   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
      if (inst->src[i].negate)
         fprintf(stderr, "-");
      if (inst->src[i].abs)
         fprintf(stderr, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(stderr, "vgrf%d", inst->src[i].reg);
         if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
             inst->src[i].subreg_offset)
            fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         break;
      case MRF:
         /* MRFs are never legal sources; flag them loudly. */
         fprintf(stderr, "***m%d***", inst->src[i].reg);
         break;
      case UNIFORM:
         fprintf(stderr, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
         if (inst->src[i].reladdr) {
            fprintf(stderr, "+reladdr");
         } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
             inst->src[i].subreg_offset) {
            /* NOTE(review): this indexes virtual_grf_sizes[] with a
             * UNIFORM-file register number, which looks copied from the
             * GRF case above -- confirm this is the intended size array
             * for uniform registers.
             */
            fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         }
         break;
      case BAD_FILE:
         fprintf(stderr, "(null)");
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(stderr, "%ff", inst->src[i].imm.f);
            break;
         case BRW_REGISTER_TYPE_D:
            fprintf(stderr, "%dd", inst->src[i].imm.i);
            break;
         case BRW_REGISTER_TYPE_UD:
            fprintf(stderr, "%uu", inst->src[i].imm.u);
            break;
         default:
            fprintf(stderr, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            fprintf(stderr, "-");
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(stderr, "|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               fprintf(stderr, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(stderr, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(stderr, "acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(stderr, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                               inst->src[i].fixed_hw_reg.subnr);
               break;
            default:
               fprintf(stderr, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                                  inst->src[i].fixed_hw_reg.subnr);
               break;
            }
         } else {
            fprintf(stderr, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
         }
         if (inst->src[i].fixed_hw_reg.subnr)
            fprintf(stderr, "+%d", inst->src[i].fixed_hw_reg.subnr);
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(stderr, "|");
         break;
      default:
         fprintf(stderr, "???");
         break;
      }
      if (inst->src[i].abs)
         fprintf(stderr, "|");

      if (inst->src[i].file != IMM) {
         fprintf(stderr, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < 2 && inst->src[i + 1].file != BAD_FILE)
         fprintf(stderr, ", ");
   }

   fprintf(stderr, " ");

   if (inst->force_uncompressed)
      fprintf(stderr, "1sthalf ");

   if (inst->force_sechalf)
      fprintf(stderr, "2ndhalf ");

   fprintf(stderr, "\n");
}
3005
3006 /**
3007 * Possibly returns an instruction that set up @param reg.
3008 *
3009 * Sometimes we want to take the result of some expression/variable
3010 * dereference tree and rewrite the instruction generating the result
3011 * of the tree. When processing the tree, we know that the
3012 * instructions generated are all writing temporaries that are dead
3013 * outside of this tree. So, if we have some instructions that write
3014 * a temporary, we're free to point that temp write somewhere else.
3015 *
3016 * Note that this doesn't guarantee that the instruction generated
3017 * only reg -- it might be the size=4 destination of a texture instruction.
3018 */
3019 fs_inst *
3020 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3021 fs_inst *end,
3022 const fs_reg &reg)
3023 {
3024 if (end == start ||
3025 end->is_partial_write() ||
3026 reg.reladdr ||
3027 !reg.equals(end->dst)) {
3028 return NULL;
3029 } else {
3030 return end;
3031 }
3032 }
3033
/**
 * Lays out the gen6+ fragment shader thread payload: assigns a payload
 * register index to each input the shader consumes (barycentrics, source
 * depth/W, sample position offsets, sample mask) and accumulates the total
 * in c->nr_payload_regs.
 */
void
fs_visitor::setup_payload_gen6()
{
   /* The shader reads gl_FragCoord (VARYING_SLOT_POS), which needs the
    * interpolated depth and W payload registers.
    */
   bool uses_depth =
      (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(brw->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch.*/

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not SIMD8. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.
    *
    * NOTE(review): this deliberately reuses uses_depth rather than a
    * separate W flag -- depth and W are requested together when
    * gl_FragCoord is read.  Confirm against the WM state setup.
    */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not SIMD8. */
         c->nr_payload_regs++;
      }
   }

   c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
   /* R31: MSAA position offsets. */
   if (c->prog_data.uses_pos_offset) {
      c->sample_pos_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
   }

   /* R32: MSAA input coverage mask */
   if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
      assert(brw->gen >= 7);
      c->sample_mask_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R33: input coverage mask if not SIMD8. */
         c->nr_payload_regs++;
      }
   }

   /* R34-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}
3108
3109 void
3110 fs_visitor::assign_binding_table_offsets()
3111 {
3112 uint32_t next_binding_table_offset = 0;
3113
3114 /* If there are no color regions, we still perform an FB write to a null
3115 * renderbuffer, which we place at surface index 0.
3116 */
3117 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3118 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3119
3120 assign_common_binding_table_offsets(next_binding_table_offset);
3121 }
3122
3123 void
3124 fs_visitor::calculate_register_pressure()
3125 {
3126 invalidate_live_intervals();
3127 calculate_live_intervals();
3128
3129 int num_instructions = 0;
3130 foreach_list(node, &this->instructions) {
3131 ++num_instructions;
3132 }
3133
3134 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3135
3136 for (int reg = 0; reg < virtual_grf_count; reg++) {
3137 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3138 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3139 }
3140 }
3141
3142 /**
3143 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3144 *
3145 * The needs_unlit_centroid_workaround ends up producing one of these per
3146 * channel of centroid input, so it's good to clean them up.
3147 *
3148 * An assumption here is that nothing ever modifies the dispatched pixels
3149 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3150 * dictates that anyway.
3151 */
3152 void
3153 fs_visitor::opt_drop_redundant_mov_to_flags()
3154 {
3155 bool flag_mov_found[2] = {false};
3156
3157 foreach_list_safe(node, &this->instructions) {
3158 fs_inst *inst = (fs_inst *)node;
3159
3160 if (inst->is_control_flow()) {
3161 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3162 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3163 if (!flag_mov_found[inst->flag_subreg])
3164 flag_mov_found[inst->flag_subreg] = true;
3165 else
3166 inst->remove();
3167 } else if (inst->writes_flag()) {
3168 flag_mov_found[inst->flag_subreg] = false;
3169 }
3170 }
3171 }
3172
/**
 * Main driver for a single fragment shader compile at this visitor's
 * dispatch width: emits FS IR from the GLSL IR (or fragment program),
 * optimizes it, and register-allocates.
 *
 * Returns false if the compile failed (fail_msg holds the reason);
 * true on success.
 */
bool
fs_visitor::run()
{
   /* Snapshot the parameter count so we can assert at the end that nothing
    * in here appended state parameters (which could realloc ParameterValues
    * out from under the uniform storage set up at link time).
    */
   sanity_param_count = fp->Base.Parameters->NumParameters;
   bool allocated_without_spills;

   assign_binding_table_offsets();

   /* Lay out the fixed payload registers (pixel positions, barycentrics,
    * etc.) delivered by the hardware before the first GRF we can allocate.
    */
   if (brw->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      /* Debug path: emit a trivial shader instead of the real one. */
      emit_dummy_fs();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (fp->Base.InputsRead > 0) {
         if (brw->gen < 6)
            emit_interpolation_setup_gen4();
         else
            emit_interpolation_setup_gen6();
      }

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (fp->UsesKill || c->key.alpha_test_func) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main(). (the visitor only descends into
       * functions called "main").
       */
      if (shader) {
         foreach_list(node, &*shader->base.ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         /* No GLSL shader: translate the Mesa IR fragment program instead. */
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
         return false;

      /* Placeholder the discard jump target; resolved in a later pass. */
      emit(FS_OPCODE_PLACEHOLDER_HALT);

      if (c->key.alpha_test_func)
         emit_alpha_test();

      emit_fb_writes();

      split_virtual_grfs();

      /* Uniform handling: large/indirect accesses become pull constants,
       * the rest get packed push-constant locations.
       */
      move_uniform_array_access_to_pull_constants();
      assign_constant_locations();
      demote_pull_constants();

      opt_drop_redundant_mov_to_flags();

      /* Optimization loop: each pass can expose opportunities for the
       * others, so iterate to a fixed point.
       */
      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = opt_peephole_predicated_break() || progress;
         progress = dead_code_eliminate() || progress;
         progress = dead_code_eliminate_local() || progress;
         progress = opt_peephole_sel() || progress;
         progress = dead_control_flow_eliminate(this) || progress;
         progress = opt_saturate_propagation() || progress;
         progress = register_coalesce() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      lower_uniform_pull_constant_loads();

      assign_curb_setup();
      assign_urb_setup();

      static enum instruction_scheduler_mode pre_modes[] = {
         SCHEDULE_PRE,
         SCHEDULE_PRE_NON_LIFO,
         SCHEDULE_PRE_LIFO,
      };

      /* Try each scheduling heuristic to see if it can successfully register
       * allocate without spilling. They should be ordered by decreasing
       * performance but increasing likelihood of allocating.
       */
      for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
         schedule_instructions(pre_modes[i]);

         if (0) {
            /* Debug path: trivial allocation, never spills. */
            assign_regs_trivial();
            allocated_without_spills = true;
         } else {
            allocated_without_spills = assign_regs(false);
         }
         if (allocated_without_spills)
            break;
      }

      if (!allocated_without_spills) {
         /* We assume that any spilling is worse than just dropping back to
          * SIMD8. There's probably actually some intermediate point where
          * SIMD16 with a couple of spills is still better.
          */
         if (dispatch_width == 16) {
            fail("Failure to register allocate. Reduce number of "
                 "live scalar values to avoid this.");
         }

         /* Since we're out of heuristics, just go spill registers until we
          * get an allocation.
          */
         while (!assign_regs(true)) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);

   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
    * so based on the actual physical registers in use.
    */
   insert_gen4_send_dependency_workarounds();

   if (failed)
      return false;

   /* A spilling allocation invalidates the pre-RA schedule; re-schedule
    * with the final register assignments.
    */
   if (!allocated_without_spills)
      schedule_instructions(SCHEDULE_POST);

   /* Record GRF usage for the 3DSTATE_WM register-block fields. */
   if (dispatch_width == 8)
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   else
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory. Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == fp->Base.Parameters->NumParameters);

   return !failed;
}
3336
3337 const unsigned *
3338 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3339 struct gl_fragment_program *fp,
3340 struct gl_shader_program *prog,
3341 unsigned *final_assembly_size)
3342 {
3343 bool start_busy = false;
3344 double start_time = 0;
3345
3346 if (unlikely(brw->perf_debug)) {
3347 start_busy = (brw->batch.last_bo &&
3348 drm_intel_bo_busy(brw->batch.last_bo));
3349 start_time = get_time();
3350 }
3351
3352 struct brw_shader *shader = NULL;
3353 if (prog)
3354 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3355
3356 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3357 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3358
3359 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3360 */
3361 fs_visitor v(brw, c, prog, fp, 8);
3362 if (!v.run()) {
3363 if (prog) {
3364 prog->LinkStatus = false;
3365 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3366 }
3367
3368 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3369 v.fail_msg);
3370
3371 return NULL;
3372 }
3373
3374 exec_list *simd16_instructions = NULL;
3375 fs_visitor v2(brw, c, prog, fp, 16);
3376 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3377 if (!v.simd16_unsupported) {
3378 /* Try a SIMD16 compile */
3379 v2.import_uniforms(&v);
3380 if (!v2.run()) {
3381 perf_debug("SIMD16 shader failed to compile, falling back to "
3382 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3383 } else {
3384 simd16_instructions = &v2.instructions;
3385 }
3386 } else {
3387 perf_debug("SIMD16 shader unsupported, falling back to "
3388 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3389 }
3390 }
3391
3392 const unsigned *assembly = NULL;
3393 if (brw->gen >= 8) {
3394 gen8_fs_generator g(brw, c, prog, fp, v.do_dual_src);
3395 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3396 final_assembly_size);
3397 } else {
3398 fs_generator g(brw, c, prog, fp, v.do_dual_src);
3399 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3400 final_assembly_size);
3401 }
3402
3403 if (unlikely(brw->perf_debug) && shader) {
3404 if (shader->compiled_once)
3405 brw_wm_debug_recompile(brw, prog, &c->key);
3406 shader->compiled_once = true;
3407
3408 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3409 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3410 (get_time() - start_time) * 1000);
3411 }
3412 }
3413
3414 return assembly;
3415 }
3416
3417 bool
3418 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3419 {
3420 struct brw_context *brw = brw_context(ctx);
3421 struct brw_wm_prog_key key;
3422
3423 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3424 return true;
3425
3426 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3427 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3428 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3429 bool program_uses_dfdy = fp->UsesDFdy;
3430
3431 memset(&key, 0, sizeof(key));
3432
3433 if (brw->gen < 6) {
3434 if (fp->UsesKill)
3435 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3436
3437 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3438 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3439
3440 /* Just assume depth testing. */
3441 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3442 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3443 }
3444
3445 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3446 BRW_FS_VARYING_INPUT_MASK) > 16)
3447 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3448
3449 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3450
3451 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3452 for (unsigned i = 0; i < sampler_count; i++) {
3453 if (fp->Base.ShadowSamplers & (1 << i)) {
3454 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3455 key.tex.swizzles[i] =
3456 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3457 } else {
3458 /* Color sampler: assume no swizzling. */
3459 key.tex.swizzles[i] = SWIZZLE_XYZW;
3460 }
3461 }
3462
3463 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3464 key.drawable_height = ctx->DrawBuffer->Height;
3465 }
3466
3467 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3468 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3469 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3470
3471 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3472 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3473 key.nr_color_regions > 1;
3474 }
3475
3476 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3477 * quality of the derivatives is likely to be determined by the driconf
3478 * option.
3479 */
3480 key.high_quality_derivatives = brw->disable_derivative_optimization;
3481
3482 key.program_string_id = bfp->id;
3483
3484 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3485 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3486
3487 bool success = do_wm_prog(brw, prog, bfp, &key);
3488
3489 brw->wm.base.prog_offset = old_prog_offset;
3490 brw->wm.prog_data = old_prog_data;
3491
3492 return success;
3493 }