i965/fs: Don't use instruction list after calculating the cfg.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
55 void
56 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->opcode = opcode;
61 this->dst = dst;
62 this->src = src;
63 this->sources = sources;
64
65 this->conditional_mod = BRW_CONDITIONAL_NONE;
66
67 /* This will be the case for almost all instructions. */
68 this->regs_written = 1;
69
70 this->writes_accumulator = false;
71 }
72
73 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
74 {
75 fs_reg *src = ralloc_array(this, fs_reg, 3);
76 init(opcode, dst, src, 0);
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
80 {
81 fs_reg *src = ralloc_array(this, fs_reg, 3);
82 src[0] = src0;
83 init(opcode, dst, src, 1);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
87 const fs_reg &src1)
88 {
89 fs_reg *src = ralloc_array(this, fs_reg, 3);
90 src[0] = src0;
91 src[1] = src1;
92 init(opcode, dst, src, 2);
93 }
94
95 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
96 const fs_reg &src1, const fs_reg &src2)
97 {
98 fs_reg *src = ralloc_array(this, fs_reg, 3);
99 src[0] = src0;
100 src[1] = src1;
101 src[2] = src2;
102 init(opcode, dst, src, 3);
103 }
104
105 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
106 {
107 init(opcode, dst, src, sources);
108 }
109
110 fs_inst::fs_inst(const fs_inst &that)
111 {
112 memcpy(this, &that, sizeof(that));
113
114 this->src = ralloc_array(this, fs_reg, that.sources);
115
116 for (int i = 0; i < that.sources; i++)
117 this->src[i] = that.src[i];
118 }
119
120 void
121 fs_inst::resize_sources(uint8_t num_sources)
122 {
123 if (this->sources != num_sources) {
124 this->src = reralloc(this, this->src, fs_reg, num_sources);
125 this->sources = num_sources;
126 }
127 }
128
129 #define ALU1(op) \
130 fs_inst * \
131 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
132 { \
133 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
134 }
135
136 #define ALU2(op) \
137 fs_inst * \
138 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
139 const fs_reg &src1) \
140 { \
141 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
142 }
143
144 #define ALU2_ACC(op) \
145 fs_inst * \
146 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
147 const fs_reg &src1) \
148 { \
149 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
150 inst->writes_accumulator = true; \
151 return inst; \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
157 const fs_reg &src1, const fs_reg &src2) \
158 { \
159 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
160 }
161
162 ALU1(NOT)
163 ALU1(MOV)
164 ALU1(FRC)
165 ALU1(RNDD)
166 ALU1(RNDE)
167 ALU1(RNDZ)
168 ALU2(ADD)
169 ALU2(MUL)
170 ALU2_ACC(MACH)
171 ALU2(AND)
172 ALU2(OR)
173 ALU2(XOR)
174 ALU2(SHL)
175 ALU2(SHR)
176 ALU2(ASR)
177 ALU3(LRP)
178 ALU1(BFREV)
179 ALU3(BFE)
180 ALU2(BFI1)
181 ALU3(BFI2)
182 ALU1(FBH)
183 ALU1(FBL)
184 ALU1(CBIT)
185 ALU3(MAD)
186 ALU2_ACC(ADDC)
187 ALU2_ACC(SUBB)
188 ALU2(SEL)
189 ALU2(MAC)
190
191 /** Gen4 predicated IF. */
192 fs_inst *
193 fs_visitor::IF(enum brw_predicate predicate)
194 {
195 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
196 inst->predicate = predicate;
197 return inst;
198 }
199
200 /** Gen6 IF with embedded comparison. */
201 fs_inst *
202 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
203 enum brw_conditional_mod condition)
204 {
205 assert(brw->gen == 6);
206 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
207 reg_null_d, src0, src1);
208 inst->conditional_mod = condition;
209 return inst;
210 }
211
212 /**
213 * CMP: Sets the low bit of the destination channels with the result
214 * of the comparison, while the upper bits are undefined, and updates
215 * the flag register with the packed 16 bits of the result.
216 */
217 fs_inst *
218 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
219 enum brw_conditional_mod condition)
220 {
221 fs_inst *inst;
222
223 /* Take the instruction:
224 *
225 * CMP null<d> src0<f> src1<f>
226 *
227 * Original gen4 does type conversion to the destination type before
228 * comparison, producing garbage results for floating point comparisons.
229 * gen5 does the comparison on the execution type (resolved source types),
230 * so dst type doesn't matter. gen6 does comparison and then uses the
231 * result as if it was the dst type with no conversion, which happens to
232 * mostly work out for float-interpreted-as-int since our comparisons are
233 * for >0, =0, <0.
234 */
235 if (brw->gen == 4) {
236 dst.type = src0.type;
237 if (dst.file == HW_REG)
238 dst.fixed_hw_reg.type = dst.type;
239 }
240
241 resolve_ud_negate(&src0);
242 resolve_ud_negate(&src1);
243
244 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
245 inst->conditional_mod = condition;
246
247 return inst;
248 }
249
250 fs_inst *
251 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
252 {
253 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
254 sources);
255 inst->regs_written = sources;
256
257 return inst;
258 }
259
260 exec_list
261 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
262 const fs_reg &surf_index,
263 const fs_reg &varying_offset,
264 uint32_t const_offset)
265 {
266 exec_list instructions;
267 fs_inst *inst;
268
269 /* We have our constant surface use a pitch of 4 bytes, so our index can
270 * be any component of a vector, and then we load 4 contiguous
271 * components starting from that.
272 *
273 * We break down the const_offset to a portion added to the variable
274 * offset and a portion done using reg_offset, which means that if you
275 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
276 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
277 * CSE can later notice that those loads are all the same and eliminate
278 * the redundant ones.
279 */
280 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
281 instructions.push_tail(ADD(vec4_offset,
282 varying_offset, const_offset & ~3));
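   /* A rough worked example of the split above (illustrative only): with
    * const_offset == 22 the code emits
    *
    *    vec4_offset = varying_offset + (22 & ~3)   ->  varying_offset + 20
    *    reg_offset  = (22 & 3) * scale             ->  component 2 of the vec4
    *
    * so accesses such as a[i].y and a[i].z share the same vec4 load and
    * differ only in reg_offset, which is what lets CSE drop the redundant
    * loads.
    */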
283
284 int scale = 1;
285 if (brw->gen == 4 && dispatch_width == 8) {
286 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
287 * u, v, r) as parameters, or we can just use the SIMD16 message
288 * consisting of (header, u). We choose the second, at the cost of a
289 * longer return length.
290 */
291 scale = 2;
292 }
293
294 enum opcode op;
295 if (brw->gen >= 7)
296 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
297 else
298 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
299 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
300 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
301 inst->regs_written = 4 * scale;
302 instructions.push_tail(inst);
303
304 if (brw->gen < 7) {
305 inst->base_mrf = 13;
306 inst->header_present = true;
307 if (brw->gen == 4)
308 inst->mlen = 3;
309 else
310 inst->mlen = 1 + dispatch_width / 8;
311 }
312
313 vec4_result.reg_offset += (const_offset & 3) * scale;
314 instructions.push_tail(MOV(dst, vec4_result));
315
316 return instructions;
317 }
318
319 /**
320 * A helper for MOV generation for fixing up broken hardware SEND dependency
321 * handling.
322 */
323 fs_inst *
324 fs_visitor::DEP_RESOLVE_MOV(int grf)
325 {
326 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
327
328 inst->ir = NULL;
329 inst->annotation = "send dependency resolve";
330
331 /* The caller always wants uncompressed to emit the minimal extra
332 * dependencies, and to avoid having to deal with aligning its regs to 2.
333 */
334 inst->force_uncompressed = true;
335
336 return inst;
337 }
338
339 bool
340 fs_inst::equals(fs_inst *inst) const
341 {
342 return (opcode == inst->opcode &&
343 dst.equals(inst->dst) &&
344 src[0].equals(inst->src[0]) &&
345 src[1].equals(inst->src[1]) &&
346 src[2].equals(inst->src[2]) &&
347 saturate == inst->saturate &&
348 predicate == inst->predicate &&
349 conditional_mod == inst->conditional_mod &&
350 mlen == inst->mlen &&
351 base_mrf == inst->base_mrf &&
352 target == inst->target &&
353 eot == inst->eot &&
354 header_present == inst->header_present &&
355 shadow_compare == inst->shadow_compare &&
356 offset == inst->offset);
357 }
358
359 bool
360 fs_inst::overwrites_reg(const fs_reg &reg) const
361 {
362 return (reg.file == dst.file &&
363 reg.reg == dst.reg &&
364 reg.reg_offset >= dst.reg_offset &&
365 reg.reg_offset < dst.reg_offset + regs_written);
366 }
367
368 bool
369 fs_inst::is_send_from_grf() const
370 {
371 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
372 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
373 opcode == FS_OPCODE_INTERPOLATE_AT_CENTROID ||
374 opcode == FS_OPCODE_INTERPOLATE_AT_SAMPLE ||
375 opcode == FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET ||
376 opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET ||
377 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
378 src[1].file == GRF) ||
379 (is_tex() && src[0].file == GRF));
380 }
381
382 bool
383 fs_inst::can_do_source_mods(struct brw_context *brw)
384 {
385 if (brw->gen == 6 && is_math())
386 return false;
387
388 if (is_send_from_grf())
389 return false;
390
391 if (!backend_instruction::can_do_source_mods())
392 return false;
393
394 return true;
395 }
396
397 void
398 fs_reg::init()
399 {
400 memset(this, 0, sizeof(*this));
401 stride = 1;
402 }
403
404 /** Generic unset register constructor. */
405 fs_reg::fs_reg()
406 {
407 init();
408 this->file = BAD_FILE;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(float f)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_F;
417 this->fixed_hw_reg.dw1.f = f;
418 }
419
420 /** Immediate value constructor. */
421 fs_reg::fs_reg(int32_t i)
422 {
423 init();
424 this->file = IMM;
425 this->type = BRW_REGISTER_TYPE_D;
426 this->fixed_hw_reg.dw1.d = i;
427 }
428
429 /** Immediate value constructor. */
430 fs_reg::fs_reg(uint32_t u)
431 {
432 init();
433 this->file = IMM;
434 this->type = BRW_REGISTER_TYPE_UD;
435 this->fixed_hw_reg.dw1.ud = u;
436 }
437
438 /** Fixed brw_reg. */
439 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
440 {
441 init();
442 this->file = HW_REG;
443 this->fixed_hw_reg = fixed_hw_reg;
444 this->type = fixed_hw_reg.type;
445 }
446
447 bool
448 fs_reg::equals(const fs_reg &r) const
449 {
450 return (file == r.file &&
451 reg == r.reg &&
452 reg_offset == r.reg_offset &&
453 subreg_offset == r.subreg_offset &&
454 type == r.type &&
455 negate == r.negate &&
456 abs == r.abs &&
457 !reladdr && !r.reladdr &&
458 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
459 sizeof(fixed_hw_reg)) == 0 &&
460 stride == r.stride);
461 }
462
463 fs_reg &
464 fs_reg::apply_stride(unsigned stride)
465 {
466 assert((this->stride * stride) <= 4 &&
467 (is_power_of_two(stride) || stride == 0) &&
468 file != HW_REG && file != IMM);
469 this->stride *= stride;
470 return *this;
471 }
472
473 fs_reg &
474 fs_reg::set_smear(unsigned subreg)
475 {
476 assert(file != HW_REG && file != IMM);
477 subreg_offset = subreg * type_sz(type);
478 stride = 0;
479 return *this;
480 }
481
482 bool
483 fs_reg::is_contiguous() const
484 {
485 return stride == 1;
486 }
487
488 bool
489 fs_reg::is_valid_3src() const
490 {
491 return file == GRF || file == UNIFORM;
492 }
493
494 int
495 fs_visitor::type_size(const struct glsl_type *type)
496 {
497 unsigned int size, i;
498
499 switch (type->base_type) {
500 case GLSL_TYPE_UINT:
501 case GLSL_TYPE_INT:
502 case GLSL_TYPE_FLOAT:
503 case GLSL_TYPE_BOOL:
504 return type->components();
505 case GLSL_TYPE_ARRAY:
506 return type_size(type->fields.array) * type->length;
507 case GLSL_TYPE_STRUCT:
508 size = 0;
509 for (i = 0; i < type->length; i++) {
510 size += type_size(type->fields.structure[i].type);
511 }
512 return size;
513 case GLSL_TYPE_SAMPLER:
514 /* Samplers take up no register space, since they're baked in at
515 * link time.
516 */
517 return 0;
518 case GLSL_TYPE_ATOMIC_UINT:
519 return 0;
520 case GLSL_TYPE_IMAGE:
521 case GLSL_TYPE_VOID:
522 case GLSL_TYPE_ERROR:
523 case GLSL_TYPE_INTERFACE:
524 unreachable("not reached");
525 }
526
527 return 0;
528 }
529
530 fs_reg
531 fs_visitor::get_timestamp()
532 {
533 assert(brw->gen >= 7);
534
535 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
536 BRW_ARF_TIMESTAMP,
537 0),
538 BRW_REGISTER_TYPE_UD));
539
540 fs_reg dst = fs_reg(this, glsl_type::uint_type);
541
542 fs_inst *mov = emit(MOV(dst, ts));
543 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
544 * even if it's not enabled in the dispatch.
545 */
546 mov->force_writemask_all = true;
547 mov->force_uncompressed = true;
548
549 /* The caller wants the low 32 bits of the timestamp. Since it's running
550     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
551 * which is plenty of time for our purposes. It is identical across the
552 * EUs, but since it's tracking GPU core speed it will increment at a
553 * varying rate as render P-states change.
554 *
555 * The caller could also check if render P-states have changed (or anything
556 * else that might disrupt timing) by setting smear to 2 and checking if
557 * that field is != 0.
558 */
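   /* As a rough sanity check on the numbers above: the field read here is
    * 32 bits wide, so at ~1.2e9 ticks per second it wraps after about
    *
    *    2^32 / 1.2e9  ~=  3.6 seconds
    *
    * which is where the "~3 seconds" figure comes from.
    */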
559 dst.set_smear(0);
560
561 return dst;
562 }
563
564 void
565 fs_visitor::emit_shader_time_begin()
566 {
567 current_annotation = "shader time start";
568 shader_start_time = get_timestamp();
569 }
570
571 void
572 fs_visitor::emit_shader_time_end()
573 {
574 current_annotation = "shader time end";
575
576 enum shader_time_shader_type type, written_type, reset_type;
577 if (dispatch_width == 8) {
578 type = ST_FS8;
579 written_type = ST_FS8_WRITTEN;
580 reset_type = ST_FS8_RESET;
581 } else {
582 assert(dispatch_width == 16);
583 type = ST_FS16;
584 written_type = ST_FS16_WRITTEN;
585 reset_type = ST_FS16_RESET;
586 }
587
588 fs_reg shader_end_time = get_timestamp();
589
590 /* Check that there weren't any timestamp reset events (assuming these
591 * were the only two timestamp reads that happened).
592 */
593 fs_reg reset = shader_end_time;
594 reset.set_smear(2);
595 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
596 test->conditional_mod = BRW_CONDITIONAL_Z;
597 emit(IF(BRW_PREDICATE_NORMAL));
598
599 push_force_uncompressed();
600 fs_reg start = shader_start_time;
601 start.negate = true;
602 fs_reg diff = fs_reg(this, glsl_type::uint_type);
603 emit(ADD(diff, start, shader_end_time));
604
605 /* If there were no instructions between the two timestamp gets, the diff
606 * is 2 cycles. Remove that overhead, so I can forget about that when
607 * trying to determine the time taken for single instructions.
608 */
609 emit(ADD(diff, diff, fs_reg(-2u)));
610
611 emit_shader_time_write(type, diff);
612 emit_shader_time_write(written_type, fs_reg(1u));
613 emit(BRW_OPCODE_ELSE);
614 emit_shader_time_write(reset_type, fs_reg(1u));
615 emit(BRW_OPCODE_ENDIF);
616
617 pop_force_uncompressed();
618 }
619
620 void
621 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
622 fs_reg value)
623 {
624 int shader_time_index =
625 brw_get_shader_time_index(brw, shader_prog, prog, type);
626 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
627
628 fs_reg payload;
629 if (dispatch_width == 8)
630 payload = fs_reg(this, glsl_type::uvec2_type);
631 else
632 payload = fs_reg(this, glsl_type::uint_type);
633
634 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
635 fs_reg(), payload, offset, value));
636 }
637
638 void
639 fs_visitor::vfail(const char *format, va_list va)
640 {
641 char *msg;
642
643 if (failed)
644 return;
645
646 failed = true;
647
648 msg = ralloc_vasprintf(mem_ctx, format, va);
649 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
650
651 this->fail_msg = msg;
652
653 if (INTEL_DEBUG & DEBUG_WM) {
654 fprintf(stderr, "%s", msg);
655 }
656 }
657
658 void
659 fs_visitor::fail(const char *format, ...)
660 {
661 va_list va;
662
663 va_start(va, format);
664 vfail(format, va);
665 va_end(va);
666 }
667
668 /**
669 * Mark this program as impossible to compile in SIMD16 mode.
670 *
671 * During the SIMD8 compile (which happens first), we can detect and flag
672 * things that are unsupported in SIMD16 mode, so the compiler can skip
673 * the SIMD16 compile altogether.
674 *
675 * During a SIMD16 compile (if one happens anyway), this just calls fail().
676 */
677 void
678 fs_visitor::no16(const char *format, ...)
679 {
680 va_list va;
681
682 va_start(va, format);
683
684 if (dispatch_width == 16) {
685 vfail(format, va);
686 } else {
687 simd16_unsupported = true;
688
689 if (brw->perf_debug) {
690 if (no16_msg)
691 ralloc_vasprintf_append(&no16_msg, format, va);
692 else
693 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
694 }
695 }
696
697 va_end(va);
698 }
699
700 fs_inst *
701 fs_visitor::emit(enum opcode opcode)
702 {
703 return emit(new(mem_ctx) fs_inst(opcode));
704 }
705
706 fs_inst *
707 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
708 {
709 return emit(new(mem_ctx) fs_inst(opcode, dst));
710 }
711
712 fs_inst *
713 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
714 {
715 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
716 }
717
718 fs_inst *
719 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
720 const fs_reg &src1)
721 {
722 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
723 }
724
725 fs_inst *
726 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
727 const fs_reg &src1, const fs_reg &src2)
728 {
729 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
730 }
731
732 fs_inst *
733 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
734 fs_reg src[], int sources)
735 {
736 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
737 }
738
739 void
740 fs_visitor::push_force_uncompressed()
741 {
742 force_uncompressed_stack++;
743 }
744
745 void
746 fs_visitor::pop_force_uncompressed()
747 {
748 force_uncompressed_stack--;
749 assert(force_uncompressed_stack >= 0);
750 }
751
752 /**
753 * Returns true if the instruction has a flag that means it won't
754 * update an entire destination register.
755 *
756 * For example, dead code elimination and live variable analysis want to know
757 * when a write to a variable screens off any preceding values that were in
758 * it.
759 */
760 bool
761 fs_inst::is_partial_write() const
762 {
763 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
764 this->force_uncompressed ||
765 this->force_sechalf || !this->dst.is_contiguous());
766 }
767
768 int
769 fs_inst::regs_read(fs_visitor *v, int arg) const
770 {
771 if (is_tex() && arg == 0 && src[0].file == GRF) {
772 if (v->dispatch_width == 16)
773 return (mlen + 1) / 2;
774 else
775 return mlen;
776 }
777 return 1;
778 }
779
780 bool
781 fs_inst::reads_flag() const
782 {
783 return predicate;
784 }
785
786 bool
787 fs_inst::writes_flag() const
788 {
789 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
790 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
791 }
792
793 /**
794 * Returns how many MRFs an FS opcode will write over.
795 *
796 * Note that this is not the 0 or 1 implied writes in an actual gen
797 * instruction -- the FS opcodes often generate MOVs in addition.
798 */
799 int
800 fs_visitor::implied_mrf_writes(fs_inst *inst)
801 {
802 if (inst->mlen == 0)
803 return 0;
804
805 if (inst->base_mrf == -1)
806 return 0;
807
808 switch (inst->opcode) {
809 case SHADER_OPCODE_RCP:
810 case SHADER_OPCODE_RSQ:
811 case SHADER_OPCODE_SQRT:
812 case SHADER_OPCODE_EXP2:
813 case SHADER_OPCODE_LOG2:
814 case SHADER_OPCODE_SIN:
815 case SHADER_OPCODE_COS:
816 return 1 * dispatch_width / 8;
817 case SHADER_OPCODE_POW:
818 case SHADER_OPCODE_INT_QUOTIENT:
819 case SHADER_OPCODE_INT_REMAINDER:
820 return 2 * dispatch_width / 8;
821 case SHADER_OPCODE_TEX:
822 case FS_OPCODE_TXB:
823 case SHADER_OPCODE_TXD:
824 case SHADER_OPCODE_TXF:
825 case SHADER_OPCODE_TXF_CMS:
826 case SHADER_OPCODE_TXF_MCS:
827 case SHADER_OPCODE_TG4:
828 case SHADER_OPCODE_TG4_OFFSET:
829 case SHADER_OPCODE_TXL:
830 case SHADER_OPCODE_TXS:
831 case SHADER_OPCODE_LOD:
832 return 1;
833 case FS_OPCODE_FB_WRITE:
834 return 2;
835 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
836 case SHADER_OPCODE_GEN4_SCRATCH_READ:
837 return 1;
838 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
839 return inst->mlen;
840 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
841 return 2;
842 case SHADER_OPCODE_UNTYPED_ATOMIC:
843 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
844 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
845 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
846 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
847 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
848 return 0;
849 default:
850 unreachable("not reached");
851 }
852 }
853
854 int
855 fs_visitor::virtual_grf_alloc(int size)
856 {
857 if (virtual_grf_array_size <= virtual_grf_count) {
858 if (virtual_grf_array_size == 0)
859 virtual_grf_array_size = 16;
860 else
861 virtual_grf_array_size *= 2;
862 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
863 virtual_grf_array_size);
864 }
865 virtual_grf_sizes[virtual_grf_count] = size;
866 return virtual_grf_count++;
867 }
868
869 /** Fixed HW reg constructor. */
870 fs_reg::fs_reg(enum register_file file, int reg)
871 {
872 init();
873 this->file = file;
874 this->reg = reg;
875 this->type = BRW_REGISTER_TYPE_F;
876 }
877
878 /** Fixed HW reg constructor. */
879 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
880 {
881 init();
882 this->file = file;
883 this->reg = reg;
884 this->type = type;
885 }
886
887 /** Automatic reg constructor. */
888 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
889 {
890 init();
891
892 this->file = GRF;
893 this->reg = v->virtual_grf_alloc(v->type_size(type));
894 this->reg_offset = 0;
895 this->type = brw_type_for_base_type(type);
896 }
897
898 fs_reg *
899 fs_visitor::variable_storage(ir_variable *var)
900 {
901 return (fs_reg *)hash_table_find(this->variable_ht, var);
902 }
903
904 void
905 import_uniforms_callback(const void *key,
906 void *data,
907 void *closure)
908 {
909 struct hash_table *dst_ht = (struct hash_table *)closure;
910 const fs_reg *reg = (const fs_reg *)data;
911
912 if (reg->file != UNIFORM)
913 return;
914
915 hash_table_insert(dst_ht, data, key);
916 }
917
918 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
919  * This brings in those uniform definitions.
920 */
921 void
922 fs_visitor::import_uniforms(fs_visitor *v)
923 {
924 hash_table_call_foreach(v->variable_ht,
925 import_uniforms_callback,
926 variable_ht);
927 this->push_constant_loc = v->push_constant_loc;
928 this->pull_constant_loc = v->pull_constant_loc;
929 this->uniforms = v->uniforms;
930 this->param_size = v->param_size;
931 }
932
933 /* Our support for uniforms is piggy-backed on the struct
934 * gl_fragment_program, because that's where the values actually
935 * get stored, rather than in some global gl_shader_program uniform
936 * store.
937 */
938 void
939 fs_visitor::setup_uniform_values(ir_variable *ir)
940 {
941 int namelen = strlen(ir->name);
942
943 /* The data for our (non-builtin) uniforms is stored in a series of
944 * gl_uniform_driver_storage structs for each subcomponent that
945 * glGetUniformLocation() could name. We know it's been set up in the same
946 * order we'd walk the type, so walk the list of storage and find anything
947 * with our name, or the prefix of a component that starts with our name.
948 */
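   /* For example (illustrative only): for ir->name "lights", storage entries
    * named "lights", "lights[2]" or "lights.position" pass the prefix test
    * below, while "lights2" is rejected because the character after the
    * prefix is neither '\0', '.' nor '['.
    */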
949 unsigned params_before = uniforms;
950 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
951 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
952
953 if (strncmp(ir->name, storage->name, namelen) != 0 ||
954 (storage->name[namelen] != 0 &&
955 storage->name[namelen] != '.' &&
956 storage->name[namelen] != '[')) {
957 continue;
958 }
959
960 unsigned slots = storage->type->component_slots();
961 if (storage->array_elements)
962 slots *= storage->array_elements;
963
964 for (unsigned i = 0; i < slots; i++) {
965 stage_prog_data->param[uniforms++] = &storage->storage[i];
966 }
967 }
968
969 /* Make sure we actually initialized the right amount of stuff here. */
970 assert(params_before + ir->type->component_slots() == uniforms);
971 (void)params_before;
972 }
973
974
975 /* Our support for builtin uniforms is even scarier than non-builtin.
976 * It sits on top of the PROG_STATE_VAR parameters that are
977 * automatically updated from GL context state.
978 */
979 void
980 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
981 {
982 const ir_state_slot *const slots = ir->state_slots;
983 assert(ir->state_slots != NULL);
984
985 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
986       /* This state reference has already been set up by ir_to_mesa, but we'll
987 * get the same index back here.
988 */
989 int index = _mesa_add_state_reference(this->prog->Parameters,
990 (gl_state_index *)slots[i].tokens);
991
992 /* Add each of the unique swizzles of the element as a parameter.
993 * This'll end up matching the expected layout of the
994 * array/matrix/structure we're trying to fill in.
995 */
996 int last_swiz = -1;
997 for (unsigned int j = 0; j < 4; j++) {
998 int swiz = GET_SWZ(slots[i].swizzle, j);
999 if (swiz == last_swiz)
1000 break;
1001 last_swiz = swiz;
1002
1003 stage_prog_data->param[uniforms++] =
1004 &prog->Parameters->ParameterValues[index][swiz];
1005 }
1006 }
1007 }
1008
1009 fs_reg *
1010 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1011 {
1012 assert(stage == MESA_SHADER_FRAGMENT);
1013 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1014 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1015 fs_reg wpos = *reg;
1016 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1017
1018 /* gl_FragCoord.x */
1019 if (ir->data.pixel_center_integer) {
1020 emit(MOV(wpos, this->pixel_x));
1021 } else {
1022 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1023 }
1024 wpos.reg_offset++;
1025
1026 /* gl_FragCoord.y */
1027 if (!flip && ir->data.pixel_center_integer) {
1028 emit(MOV(wpos, this->pixel_y));
1029 } else {
1030 fs_reg pixel_y = this->pixel_y;
1031 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1032
1033 if (flip) {
1034 pixel_y.negate = true;
1035 offset += key->drawable_height - 1.0;
1036 }
1037
1038 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1039 }
1040 wpos.reg_offset++;
1041
1042 /* gl_FragCoord.z */
1043 if (brw->gen >= 6) {
1044 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1045 } else {
1046 emit(FS_OPCODE_LINTERP, wpos,
1047 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1048 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1049 interp_reg(VARYING_SLOT_POS, 2));
1050 }
1051 wpos.reg_offset++;
1052
1053 /* gl_FragCoord.w: Already set up in emit_interpolation */
1054 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1055
1056 return reg;
1057 }
1058
1059 fs_inst *
1060 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1061 glsl_interp_qualifier interpolation_mode,
1062 bool is_centroid, bool is_sample)
1063 {
1064 brw_wm_barycentric_interp_mode barycoord_mode;
1065 if (brw->gen >= 6) {
1066 if (is_centroid) {
1067 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1068 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1069 else
1070 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1071 } else if (is_sample) {
1072 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1073 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1074 else
1075 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1076 } else {
1077 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1078 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1079 else
1080 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1081 }
1082 } else {
1083 /* On Ironlake and below, there is only one interpolation mode.
1084 * Centroid interpolation doesn't mean anything on this hardware --
1085 * there is no multisampling.
1086 */
1087 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1088 }
1089 return emit(FS_OPCODE_LINTERP, attr,
1090 this->delta_x[barycoord_mode],
1091 this->delta_y[barycoord_mode], interp);
1092 }
1093
1094 fs_reg *
1095 fs_visitor::emit_general_interpolation(ir_variable *ir)
1096 {
1097 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1098 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1099 fs_reg attr = *reg;
1100
1101 assert(stage == MESA_SHADER_FRAGMENT);
1102 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1103 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1104
1105 unsigned int array_elements;
1106 const glsl_type *type;
1107
1108 if (ir->type->is_array()) {
1109 array_elements = ir->type->length;
1110 if (array_elements == 0) {
1111 fail("dereferenced array '%s' has length 0\n", ir->name);
1112 }
1113 type = ir->type->fields.array;
1114 } else {
1115 array_elements = 1;
1116 type = ir->type;
1117 }
1118
1119 glsl_interp_qualifier interpolation_mode =
1120 ir->determine_interpolation_mode(key->flat_shade);
1121
1122 int location = ir->data.location;
1123 for (unsigned int i = 0; i < array_elements; i++) {
1124 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1125 if (prog_data->urb_setup[location] == -1) {
1126 /* If there's no incoming setup data for this slot, don't
1127 * emit interpolation for it.
1128 */
1129 attr.reg_offset += type->vector_elements;
1130 location++;
1131 continue;
1132 }
1133
1134 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1135 /* Constant interpolation (flat shading) case. The SF has
1136 * handed us defined values in only the constant offset
1137 * field of the setup reg.
1138 */
1139 for (unsigned int k = 0; k < type->vector_elements; k++) {
1140 struct brw_reg interp = interp_reg(location, k);
1141 interp = suboffset(interp, 3);
1142 interp.type = reg->type;
1143 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1144 attr.reg_offset++;
1145 }
1146 } else {
1147 /* Smooth/noperspective interpolation case. */
1148 for (unsigned int k = 0; k < type->vector_elements; k++) {
1149 struct brw_reg interp = interp_reg(location, k);
1150 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1151 /* Get the pixel/sample mask into f0 so that we know
1152 * which pixels are lit. Then, for each channel that is
1153 * unlit, replace the centroid data with non-centroid
1154 * data.
1155 */
1156 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1157
1158 fs_inst *inst;
1159 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1160 false, false);
1161 inst->predicate = BRW_PREDICATE_NORMAL;
1162 inst->predicate_inverse = true;
1163 if (brw->has_pln)
1164 inst->no_dd_clear = true;
1165
1166 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1167 ir->data.centroid && !key->persample_shading,
1168 ir->data.sample || key->persample_shading);
1169 inst->predicate = BRW_PREDICATE_NORMAL;
1170 inst->predicate_inverse = false;
1171 if (brw->has_pln)
1172 inst->no_dd_check = true;
1173
1174 } else {
1175 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1176 ir->data.centroid && !key->persample_shading,
1177 ir->data.sample || key->persample_shading);
1178 }
1179 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1180 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1181 }
1182 attr.reg_offset++;
1183 }
1184
1185 }
1186 location++;
1187 }
1188 }
1189
1190 return reg;
1191 }
1192
1193 fs_reg *
1194 fs_visitor::emit_frontfacing_interpolation()
1195 {
1196 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1197
1198 if (brw->gen >= 6) {
1199 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1200 * a boolean result from this (~0/true or 0/false).
1201 *
1202 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1203 * this task in only one instruction:
1204 * - a negation source modifier will flip the bit; and
1205 * - a W -> D type conversion will sign extend the bit into the high
1206 * word of the destination.
1207 *
1208 * An ASR 15 fills the low word of the destination.
1209 */
1210 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1211 g0.negate = true;
1212
1213 emit(ASR(*reg, g0, fs_reg(15)));
1214 } else {
1215 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1216 * a boolean result from this (1/true or 0/false).
1217 *
1218 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1219 * the negation source modifier to flip it. Unfortunately the SHR
1220 * instruction only operates on UD (or D with an abs source modifier)
1221 * sources without negation.
1222 *
1223 * Instead, use ASR (which will give ~0/true or 0/false) followed by an
1224 * AND 1.
1225 */
1226 fs_reg asr = fs_reg(this, glsl_type::bool_type);
1227 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1228 g1_6.negate = true;
1229
1230 emit(ASR(asr, g1_6, fs_reg(31)));
1231 emit(AND(*reg, asr, fs_reg(1)));
1232 }
1233
1234 return reg;
1235 }
1236
1237 void
1238 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1239 {
1240 assert(stage == MESA_SHADER_FRAGMENT);
1241 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1242 assert(dst.type == BRW_REGISTER_TYPE_F);
1243
1244 if (key->compute_pos_offset) {
1245 /* Convert int_sample_pos to floating point */
1246 emit(MOV(dst, int_sample_pos));
1247 /* Scale to the range [0, 1] */
1248 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
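      /* Illustrative: the payload encodes sample positions in sixteenths of a
       * pixel, so an integer position of 8 maps to 8 / 16 = 0.5 (the pixel
       * center) and 12 maps to 0.75.
       */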
1249 }
1250 else {
1251 /* From ARB_sample_shading specification:
1252 * "When rendering to a non-multisample buffer, or if multisample
1253 * rasterization is disabled, gl_SamplePosition will always be
1254       *   (0.5, 0.5)."
1255 */
1256 emit(MOV(dst, fs_reg(0.5f)));
1257 }
1258 }
1259
1260 fs_reg *
1261 fs_visitor::emit_samplepos_setup()
1262 {
1263 assert(brw->gen >= 6);
1264
1265 this->current_annotation = "compute sample position";
1266 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1267 fs_reg pos = *reg;
1268 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1269 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1270
1271 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1272 * mode will be enabled.
1273 *
1274 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1275 * R31.1:0 Position Offset X/Y for Slot[3:0]
1276 * R31.3:2 Position Offset X/Y for Slot[7:4]
1277 * .....
1278 *
1279 * The X, Y sample positions come in as bytes in thread payload. So, read
1280 * the positions using vstride=16, width=8, hstride=2.
1281 */
1282 struct brw_reg sample_pos_reg =
1283 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1284 BRW_REGISTER_TYPE_B), 16, 8, 2);
1285
1286 fs_inst *inst = emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1287 if (dispatch_width == 16) {
1288 inst->force_uncompressed = true;
1289 inst = emit(MOV(half(int_sample_x, 1),
1290 fs_reg(suboffset(sample_pos_reg, 16))));
1291 inst->force_sechalf = true;
1292 }
1293 /* Compute gl_SamplePosition.x */
1294 compute_sample_position(pos, int_sample_x);
1295 pos.reg_offset++;
1296 inst = emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1297 if (dispatch_width == 16) {
1298 inst->force_uncompressed = true;
1299 inst = emit(MOV(half(int_sample_y, 1),
1300 fs_reg(suboffset(sample_pos_reg, 17))));
1301 inst->force_sechalf = true;
1302 }
1303 /* Compute gl_SamplePosition.y */
1304 compute_sample_position(pos, int_sample_y);
1305 return reg;
1306 }
1307
1308 fs_reg *
1309 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1310 {
1311 assert(stage == MESA_SHADER_FRAGMENT);
1312 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1313 assert(brw->gen >= 6);
1314
1315 this->current_annotation = "compute sample id";
1316 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1317
1318 if (key->compute_sample_id) {
1319 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1320 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1321 t2.type = BRW_REGISTER_TYPE_UW;
1322
1323 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1324 * 8x multisampling, subspan 0 will represent sample N (where N
1325 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1326 * 7. We can find the value of N by looking at R0.0 bits 7:6
1327 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1328 * (since samples are always delivered in pairs). That is, we
1329 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1330 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1331 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1332 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1333 * populating a temporary variable with the sequence (0, 1, 2, 3),
1334 * and then reading from it using vstride=1, width=4, hstride=0.
1335 * These computations hold good for 4x multisampling as well.
1336 *
1337 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1338 * the first four slots are sample 0 of subspan 0; the next four
1339 * are sample 1 of subspan 0; the third group is sample 0 of
1340 * subspan 1, and finally sample 1 of subspan 1.
1341 */
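      /* A rough worked example of the math above: if R0.0 bits 7:6 read
       * 0b10, then (R0.0 & 0xc0) >> 5 == 4, so subspan 0 holds sample 4.
       * Adding the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1) then yields
       * sample ID 4 for the first subspan and 5 for the second.
       */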
1342 fs_inst *inst;
1343 inst = emit(BRW_OPCODE_AND, t1,
1344 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1345 fs_reg(0xc0));
1346 inst->force_writemask_all = true;
1347 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1348 inst->force_writemask_all = true;
1349 /* This works for both SIMD8 and SIMD16 */
1350 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1351 inst->force_writemask_all = true;
1352 /* This special instruction takes care of setting vstride=1,
1353 * width=4, hstride=0 of t2 during an ADD instruction.
1354 */
1355 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1356 } else {
1357 /* As per GL_ARB_sample_shading specification:
1358 * "When rendering to a non-multisample buffer, or if multisample
1359 * rasterization is disabled, gl_SampleID will always be zero."
1360 */
1361 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1362 }
1363
1364 return reg;
1365 }
1366
1367 fs_reg
1368 fs_visitor::fix_math_operand(fs_reg src)
1369 {
1370 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1371 * might be able to do better by doing execsize = 1 math and then
1372 * expanding that result out, but we would need to be careful with
1373 * masking.
1374 *
1375 * The hardware ignores source modifiers (negate and abs) on math
1376 * instructions, so we also move to a temp to set those up.
1377 */
1378 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1379 !src.abs && !src.negate)
1380 return src;
1381
1382 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1383 * operands to math
1384 */
1385 if (brw->gen >= 7 && src.file != IMM)
1386 return src;
1387
1388 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1389 expanded.type = src.type;
1390 emit(BRW_OPCODE_MOV, expanded, src);
1391 return expanded;
1392 }
1393
1394 fs_inst *
1395 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1396 {
1397 switch (opcode) {
1398 case SHADER_OPCODE_RCP:
1399 case SHADER_OPCODE_RSQ:
1400 case SHADER_OPCODE_SQRT:
1401 case SHADER_OPCODE_EXP2:
1402 case SHADER_OPCODE_LOG2:
1403 case SHADER_OPCODE_SIN:
1404 case SHADER_OPCODE_COS:
1405 break;
1406 default:
1407 unreachable("not reached: bad math opcode");
1408 }
1409
1410 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1411 * might be able to do better by doing execsize = 1 math and then
1412 * expanding that result out, but we would need to be careful with
1413 * masking.
1414 *
1415 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1416 * instructions, so we also move to a temp to set those up.
1417 */
1418 if (brw->gen == 6 || brw->gen == 7)
1419 src = fix_math_operand(src);
1420
1421 fs_inst *inst = emit(opcode, dst, src);
1422
1423 if (brw->gen < 6) {
1424 inst->base_mrf = 2;
1425 inst->mlen = dispatch_width / 8;
1426 }
1427
1428 return inst;
1429 }
1430
1431 fs_inst *
1432 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1433 {
1434 int base_mrf = 2;
1435 fs_inst *inst;
1436
1437 if (brw->gen >= 8) {
1438 inst = emit(opcode, dst, src0, src1);
1439 } else if (brw->gen >= 6) {
1440 src0 = fix_math_operand(src0);
1441 src1 = fix_math_operand(src1);
1442
1443 inst = emit(opcode, dst, src0, src1);
1444 } else {
1445 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1446 * "Message Payload":
1447 *
1448 * "Operand0[7]. For the INT DIV functions, this operand is the
1449 * denominator."
1450 * ...
1451 * "Operand1[7]. For the INT DIV functions, this operand is the
1452 * numerator."
1453 */
1454 bool is_int_div = opcode != SHADER_OPCODE_POW;
1455 fs_reg &op0 = is_int_div ? src1 : src0;
1456 fs_reg &op1 = is_int_div ? src0 : src1;
1457
1458 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1459 inst = emit(opcode, dst, op0, reg_null_f);
1460
1461 inst->base_mrf = base_mrf;
1462 inst->mlen = 2 * dispatch_width / 8;
1463 }
1464 return inst;
1465 }
1466
1467 void
1468 fs_visitor::assign_curb_setup()
1469 {
1470 if (dispatch_width == 8) {
1471 prog_data->dispatch_grf_start_reg = payload.num_regs;
1472 } else {
1473 assert(stage == MESA_SHADER_FRAGMENT);
1474 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1475 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1476 }
1477
1478 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1479
1480 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1481 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1482 for (unsigned int i = 0; i < inst->sources; i++) {
1483 if (inst->src[i].file == UNIFORM) {
1484 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1485 int constant_nr;
1486 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1487 constant_nr = push_constant_loc[uniform_nr];
1488 } else {
1489 /* Section 5.11 of the OpenGL 4.1 spec says:
1490 * "Out-of-bounds reads return undefined values, which include
1491 * values from other variables of the active program or zero."
1492 * Just return the first push constant.
1493 */
1494 constant_nr = 0;
1495 }
1496
1497 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1498 constant_nr / 8,
1499 constant_nr % 8);
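            /* For example (illustrative only): with payload.num_regs == 2,
             * push constant 11 lands in g3.3 -- register 2 + 11 / 8 == 3,
             * channel 11 % 8 == 3.
             */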
1500
1501 inst->src[i].file = HW_REG;
1502 inst->src[i].fixed_hw_reg = byte_offset(
1503 retype(brw_reg, inst->src[i].type),
1504 inst->src[i].subreg_offset);
1505 }
1506 }
1507 }
1508 }
1509
1510 void
1511 fs_visitor::calculate_urb_setup()
1512 {
1513 assert(stage == MESA_SHADER_FRAGMENT);
1514 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1515 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1516
1517 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1518 prog_data->urb_setup[i] = -1;
1519 }
1520
1521 int urb_next = 0;
1522 /* Figure out where each of the incoming setup attributes lands. */
1523 if (brw->gen >= 6) {
1524 if (_mesa_bitcount_64(prog->InputsRead &
1525 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1526 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1527 * first 16 varying inputs, so we can put them wherever we want.
1528 * Just put them in order.
1529 *
1530 * This is useful because it means that (a) inputs not used by the
1531 * fragment shader won't take up valuable register space, and (b) we
1532 * won't have to recompile the fragment shader if it gets paired with
1533 * a different vertex (or geometry) shader.
1534 */
1535 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1536 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1537 BITFIELD64_BIT(i)) {
1538 prog_data->urb_setup[i] = urb_next++;
1539 }
1540 }
1541 } else {
1542 /* We have enough input varyings that the SF/SBE pipeline stage can't
1543 * arbitrarily rearrange them to suit our whim; we have to put them
1544 * in an order that matches the output of the previous pipeline stage
1545 * (geometry or vertex shader).
1546 */
1547 struct brw_vue_map prev_stage_vue_map;
1548 brw_compute_vue_map(brw, &prev_stage_vue_map,
1549 key->input_slots_valid);
1550 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1551 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1552 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1553 slot++) {
1554 int varying = prev_stage_vue_map.slot_to_varying[slot];
1555 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1556 * unused.
1557 */
1558 if (varying != BRW_VARYING_SLOT_COUNT &&
1559 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1560 BITFIELD64_BIT(varying))) {
1561 prog_data->urb_setup[varying] = slot - first_slot;
1562 }
1563 }
1564 urb_next = prev_stage_vue_map.num_slots - first_slot;
1565 }
1566 } else {
1567 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1568 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1569 /* Point size is packed into the header, not as a general attribute */
1570 if (i == VARYING_SLOT_PSIZ)
1571 continue;
1572
1573 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1574 /* The back color slot is skipped when the front color is
1575 * also written to. In addition, some slots can be
1576 * written in the vertex shader and not read in the
1577 * fragment shader. So the register number must always be
1578 * incremented, mapped or not.
1579 */
1580 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1581 prog_data->urb_setup[i] = urb_next;
1582 urb_next++;
1583 }
1584 }
1585
1586 /*
1587  * It's an FS-only attribute, and we did the interpolation for this attribute
1588  * in the SF thread. So, count it here, too.
1589 *
1590 * See compile_sf_prog() for more info.
1591 */
1592 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1593 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1594 }
1595
1596 prog_data->num_varying_inputs = urb_next;
1597 }
1598
1599 void
1600 fs_visitor::assign_urb_setup()
1601 {
1602 assert(stage == MESA_SHADER_FRAGMENT);
1603 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1604
1605 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1606
1607    /* Offset all the urb_setup[] indices by the actual position of the
1608 * setup regs, now that the location of the constants has been chosen.
1609 */
1610 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1611 if (inst->opcode == FS_OPCODE_LINTERP) {
1612 assert(inst->src[2].file == HW_REG);
1613 inst->src[2].fixed_hw_reg.nr += urb_start;
1614 }
1615
1616 if (inst->opcode == FS_OPCODE_CINTERP) {
1617 assert(inst->src[0].file == HW_REG);
1618 inst->src[0].fixed_hw_reg.nr += urb_start;
1619 }
1620 }
1621
1622 /* Each attribute is 4 setup channels, each of which is half a reg. */
1623 this->first_non_payload_grf =
1624 urb_start + prog_data->num_varying_inputs * 2;
1625 }
1626
1627 /**
1628 * Split large virtual GRFs into separate components if we can.
1629 *
1630  * This mostly duplicates what brw_fs_vector_splitting does,
1631 * but that's really conservative because it's afraid of doing
1632 * splitting that doesn't result in real progress after the rest of
1633 * the optimization phases, which would cause infinite looping in
1634 * optimization. We can do it once here, safely. This also has the
1635 * opportunity to split interpolated values, or maybe even uniforms,
1636 * which we don't have at the IR level.
1637 *
1638 * We want to split, because virtual GRFs are what we register
1639 * allocate and spill (due to contiguousness requirements for some
1640 * instructions), and they're what we naturally generate in the
1641 * codegen process, but most virtual GRFs don't actually need to be
1642 * contiguous sets of GRFs. If we split, we'll end up with reduced
1643 * live intervals and better dead code elimination and coalescing.
1644 */
1645 void
1646 fs_visitor::split_virtual_grfs()
1647 {
1648 int num_vars = this->virtual_grf_count;
1649 bool split_grf[num_vars];
1650 int new_virtual_grf[num_vars];
1651
1652 /* Try to split anything > 0 sized. */
1653 for (int i = 0; i < num_vars; i++) {
1654 if (this->virtual_grf_sizes[i] != 1)
1655 split_grf[i] = true;
1656 else
1657 split_grf[i] = false;
1658 }
1659
1660 if (brw->has_pln &&
1661 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1662 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1663 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1664 * Gen6, that was the only supported interpolation mode, and since Gen6,
1665 * delta_x and delta_y are in fixed hardware registers.
1666 */
1667 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1668 false;
1669 }
1670
1671 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1672 /* If there's a SEND message that requires contiguous destination
1673 * registers, no splitting is allowed.
1674 */
1675 if (inst->regs_written > 1) {
1676 split_grf[inst->dst.reg] = false;
1677 }
1678
1679 /* If we're sending from a GRF, don't split it, on the assumption that
1680 * the send is reading the whole thing.
1681 */
1682 if (inst->is_send_from_grf()) {
1683 for (int i = 0; i < inst->sources; i++) {
1684 if (inst->src[i].file == GRF) {
1685 split_grf[inst->src[i].reg] = false;
1686 }
1687 }
1688 }
1689 }
1690
1691 /* Allocate new space for split regs. Note that the virtual
1692 * numbers will be contiguous.
1693 */
1694 for (int i = 0; i < num_vars; i++) {
1695 if (split_grf[i]) {
1696 new_virtual_grf[i] = virtual_grf_alloc(1);
1697 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1698 int reg = virtual_grf_alloc(1);
1699 assert(reg == new_virtual_grf[i] + j - 1);
1700 (void) reg;
1701 }
1702 this->virtual_grf_sizes[i] = 1;
1703 }
1704 }
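   /* Illustrative mapping after the allocation above: a size-4 VGRF n that
    * got split keeps reg_offset 0 in n itself (now size 1), while offsets
    * 1, 2 and 3 are rewritten below to new_virtual_grf[n], new_virtual_grf[n] + 1
    * and new_virtual_grf[n] + 2.
    */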
1705
1706 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1707 if (inst->dst.file == GRF &&
1708 split_grf[inst->dst.reg] &&
1709 inst->dst.reg_offset != 0) {
1710 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1711 inst->dst.reg_offset - 1);
1712 inst->dst.reg_offset = 0;
1713 }
1714 for (int i = 0; i < inst->sources; i++) {
1715 if (inst->src[i].file == GRF &&
1716 split_grf[inst->src[i].reg] &&
1717 inst->src[i].reg_offset != 0) {
1718 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1719 inst->src[i].reg_offset - 1);
1720 inst->src[i].reg_offset = 0;
1721 }
1722 }
1723 }
1724 invalidate_live_intervals();
1725 }
1726
1727 /**
1728 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1729 *
1730 * During code generation, we create tons of temporary variables, many of
1731 * which get immediately killed and are never used again. Yet, in later
1732 * optimization and analysis passes, such as compute_live_intervals, we need
1733 * to loop over all the virtual GRFs. Compacting them can save a lot of
1734 * overhead.
1735 */
1736 void
1737 fs_visitor::compact_virtual_grfs()
1738 {
1739 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1740 return;
1741
1742 /* Mark which virtual GRFs are used, and count how many. */
1743 int remap_table[this->virtual_grf_count];
1744 memset(remap_table, -1, sizeof(remap_table));
1745
1746 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1747 if (inst->dst.file == GRF)
1748 remap_table[inst->dst.reg] = 0;
1749
1750 for (int i = 0; i < inst->sources; i++) {
1751 if (inst->src[i].file == GRF)
1752 remap_table[inst->src[i].reg] = 0;
1753 }
1754 }
1755
1756 /* Compact the GRF arrays. */
1757 int new_index = 0;
1758 for (int i = 0; i < this->virtual_grf_count; i++) {
1759 if (remap_table[i] != -1) {
1760 remap_table[i] = new_index;
1761 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1762 invalidate_live_intervals();
1763 ++new_index;
1764 }
1765 }
1766
1767 this->virtual_grf_count = new_index;
1768
1769 /* Patch all the instructions to use the newly renumbered registers */
1770 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1771 if (inst->dst.file == GRF)
1772 inst->dst.reg = remap_table[inst->dst.reg];
1773
1774 for (int i = 0; i < inst->sources; i++) {
1775 if (inst->src[i].file == GRF)
1776 inst->src[i].reg = remap_table[inst->src[i].reg];
1777 }
1778 }
1779
1780 /* Patch all the references to delta_x/delta_y, since they're used in
1781 * register allocation. If they're unused, switch them to BAD_FILE so
1782 * we don't think some random VGRF is delta_x/delta_y.
1783 */
1784 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
1785 if (delta_x[i].file == GRF) {
1786 if (remap_table[delta_x[i].reg] != -1) {
1787 delta_x[i].reg = remap_table[delta_x[i].reg];
1788 } else {
1789 delta_x[i].file = BAD_FILE;
1790 }
1791 }
1792 }
1793 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
1794 if (delta_y[i].file == GRF) {
1795 if (remap_table[delta_y[i].reg] != -1) {
1796 delta_y[i].reg = remap_table[delta_y[i].reg];
1797 } else {
1798 delta_y[i].file = BAD_FILE;
1799 }
1800 }
1801 }
1802 }
1803
1804 /*
1805 * Implements array access of uniforms by inserting a
1806 * PULL_CONSTANT_LOAD instruction.
1807 *
1808 * Unlike temporary GRF array access (where we don't support it due to
1809 * the difficulty of doing relative addressing on instruction
1810 * destinations), we could potentially do array access of uniforms
1811 * that were loaded in GRF space as push constants. In real-world
1812 * usage we've seen, though, the arrays being used are always larger
1813 * than we could load as push constants, so just always move all
1814 * uniform array access out to a pull constant buffer.
1815 */
1816 void
1817 fs_visitor::move_uniform_array_access_to_pull_constants()
1818 {
1819 if (dispatch_width != 8)
1820 return;
1821
1822 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1823
1824 for (unsigned int i = 0; i < uniforms; i++) {
1825 pull_constant_loc[i] = -1;
1826 }
1827
1828 /* Walk through and find array access of uniforms. Put a copy of that
1829 * uniform in the pull constant buffer.
1830 *
1831 * Note that we don't move constant-indexed accesses to arrays. No
1832 * testing has been done of the performance impact of this choice.
1833 */
1834 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1835 for (int i = 0 ; i < inst->sources; i++) {
1836 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1837 continue;
1838
1839 int uniform = inst->src[i].reg;
1840
1841 /* If this array isn't already present in the pull constant buffer,
1842 * add it.
1843 */
1844 if (pull_constant_loc[uniform] == -1) {
1845 const gl_constant_value **values = &stage_prog_data->param[uniform];
1846
1847 assert(param_size[uniform]);
1848
1849 for (int j = 0; j < param_size[uniform]; j++) {
1850 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1851
1852 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1853 values[j];
1854 }
1855 }
1856 }
1857 }
1858 }
1859
1860 /**
1861 * Assign UNIFORM file registers to either push constants or pull constants.
1862 *
1863 * We allow a fragment shader to have more than the specification's
1864 * minimum value for the maximum number of fragment shader uniform
1865 * components (64). If there are too many of these, they'd fill up all
1866 * of register space. So, this will push some of them out to the pull
1867 * constant buffer and update the program to load them from there.
1868 */
1869 void
1870 fs_visitor::assign_constant_locations()
1871 {
1872 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1873 if (dispatch_width != 8)
1874 return;
1875
1876 /* Find which UNIFORM registers are still in use. */
1877 bool is_live[uniforms];
1878 for (unsigned int i = 0; i < uniforms; i++) {
1879 is_live[i] = false;
1880 }
1881
1882 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1883 for (int i = 0; i < inst->sources; i++) {
1884 if (inst->src[i].file != UNIFORM)
1885 continue;
1886
1887 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1888 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1889 is_live[constant_nr] = true;
1890 }
1891 }
1892
1893 /* Only allow 16 registers (128 uniform components) as push constants.
1894 *
1895 * Just demote the end of the list. We could probably do better
1896 * here, demoting things that are rarely used in the program first.
1897 *
1898 * If changing this value, note the limitation about total_regs in
1899 * brw_curbe.c.
1900 */
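/* Worked example (illustrative): with 200 live, not-yet-pulled uniform
 * components and the 16 * 8 = 128 component budget below, the first 128
 * get ascending push_constant_loc slots and the remaining 72 are appended
 * to pull_param[]; param[] is then condensed to just the pushed entries.
 */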
1901 unsigned int max_push_components = 16 * 8;
1902 unsigned int num_push_constants = 0;
1903
1904 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1905
1906 for (unsigned int i = 0; i < uniforms; i++) {
1907 if (!is_live[i] || pull_constant_loc[i] != -1) {
1908 /* This UNIFORM register is either dead, or has already been demoted
1909 * to a pull const. Mark it as no longer living in the param[] array.
1910 */
1911 push_constant_loc[i] = -1;
1912 continue;
1913 }
1914
1915 if (num_push_constants < max_push_components) {
1916 /* Retain as a push constant. Record the location in the params[]
1917 * array.
1918 */
1919 push_constant_loc[i] = num_push_constants++;
1920 } else {
1921 /* Demote to a pull constant. */
1922 push_constant_loc[i] = -1;
1923
1924 int pull_index = stage_prog_data->nr_pull_params++;
1925 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1926 pull_constant_loc[i] = pull_index;
1927 }
1928 }
1929
1930 stage_prog_data->nr_params = num_push_constants;
1931
1932 /* Up until now, the param[] array has been indexed by reg + reg_offset
1933 * of UNIFORM registers. Condense it to only contain the uniforms we
1934 * chose to upload as push constants.
1935 */
1936 for (unsigned int i = 0; i < uniforms; i++) {
1937 int remapped = push_constant_loc[i];
1938
1939 if (remapped == -1)
1940 continue;
1941
1942 assert(remapped <= (int)i);
1943 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1944 }
1945 }
1946
1947 /**
1948 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1949 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1950 */
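/* Worked example (illustrative, not from the original source): a source
 * reading a uniform whose pull_constant_loc is 7 becomes a
 * UNIFORM_PULL_CONSTANT_LOAD from byte offset 16 (7 * 4 rounded down to a
 * 16-byte block) into a float temp, with set_smear(7 & 3) picking the
 * fourth dword of that block as the rewritten source.
 */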
1951 void
1952 fs_visitor::demote_pull_constants()
1953 {
1954 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1955 for (int i = 0; i < inst->sources; i++) {
1956 if (inst->src[i].file != UNIFORM)
1957 continue;
1958
1959 int pull_index = pull_constant_loc[inst->src[i].reg +
1960 inst->src[i].reg_offset];
1961 if (pull_index == -1)
1962 continue;
1963
1964 /* Set up the annotation tracking for newly generated instructions. */
1965 base_ir = inst->ir;
1966 current_annotation = inst->annotation;
1967
1968 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1969 fs_reg dst = fs_reg(this, glsl_type::float_type);
1970
1971 /* Generate a pull load into dst. */
1972 if (inst->src[i].reladdr) {
1973 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1974 surf_index,
1975 *inst->src[i].reladdr,
1976 pull_index);
1977 inst->insert_before(block, &list);
1978 inst->src[i].reladdr = NULL;
1979 } else {
1980 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1981 fs_inst *pull =
1982 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1983 dst, surf_index, offset);
1984 inst->insert_before(block, pull);
1985 inst->src[i].set_smear(pull_index & 3);
1986 }
1987
1988 /* Rewrite the instruction to use the temporary VGRF. */
1989 inst->src[i].file = GRF;
1990 inst->src[i].reg = dst.reg;
1991 inst->src[i].reg_offset = 0;
1992 }
1993 }
1994 invalidate_live_intervals();
1995 }
1996
1997 bool
1998 fs_visitor::opt_algebraic()
1999 {
2000 bool progress = false;
2001
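/* Illustrative examples of the rewrites below (not from the original
 * source): "mul vgrf2, vgrf1, 1.0f" becomes "mov vgrf2, vgrf1",
 * "mul vgrf2, vgrf1, 0.0f" becomes "mov vgrf2, 0.0f", and
 * "add vgrf2, vgrf1, 0.0f" becomes "mov vgrf2, vgrf1"; copy propagation
 * and dead code elimination then clean up the resulting MOVs.
 */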
2002 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2003 switch (inst->opcode) {
2004 case BRW_OPCODE_MUL:
2005 if (inst->src[1].file != IMM)
2006 continue;
2007
2008 /* a * 1.0 = a */
2009 if (inst->src[1].is_one()) {
2010 inst->opcode = BRW_OPCODE_MOV;
2011 inst->src[1] = reg_undef;
2012 progress = true;
2013 break;
2014 }
2015
2016 /* a * 0.0 = 0.0 */
2017 if (inst->src[1].is_zero()) {
2018 inst->opcode = BRW_OPCODE_MOV;
2019 inst->src[0] = inst->src[1];
2020 inst->src[1] = reg_undef;
2021 progress = true;
2022 break;
2023 }
2024
2025 break;
2026 case BRW_OPCODE_ADD:
2027 if (inst->src[1].file != IMM)
2028 continue;
2029
2030 /* a + 0.0 = a */
2031 if (inst->src[1].is_zero()) {
2032 inst->opcode = BRW_OPCODE_MOV;
2033 inst->src[1] = reg_undef;
2034 progress = true;
2035 break;
2036 }
2037 break;
2038 case BRW_OPCODE_OR:
2039 if (inst->src[0].equals(inst->src[1])) {
2040 inst->opcode = BRW_OPCODE_MOV;
2041 inst->src[1] = reg_undef;
2042 progress = true;
2043 break;
2044 }
2045 break;
2046 case BRW_OPCODE_LRP:
2047 if (inst->src[1].equals(inst->src[2])) {
2048 inst->opcode = BRW_OPCODE_MOV;
2049 inst->src[0] = inst->src[1];
2050 inst->src[1] = reg_undef;
2051 inst->src[2] = reg_undef;
2052 progress = true;
2053 break;
2054 }
2055 break;
2056 case BRW_OPCODE_SEL:
2057 if (inst->src[0].equals(inst->src[1])) {
2058 inst->opcode = BRW_OPCODE_MOV;
2059 inst->src[1] = reg_undef;
2060 inst->predicate = BRW_PREDICATE_NONE;
2061 inst->predicate_inverse = false;
2062 progress = true;
2063 } else if (inst->saturate && inst->src[1].file == IMM) {
2064 switch (inst->conditional_mod) {
2065 case BRW_CONDITIONAL_LE:
2066 case BRW_CONDITIONAL_L:
2067 switch (inst->src[1].type) {
2068 case BRW_REGISTER_TYPE_F:
2069 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2070 inst->opcode = BRW_OPCODE_MOV;
2071 inst->src[1] = reg_undef;
2072 progress = true;
2073 }
2074 break;
2075 default:
2076 break;
2077 }
2078 break;
2079 case BRW_CONDITIONAL_GE:
2080 case BRW_CONDITIONAL_G:
2081 switch (inst->src[1].type) {
2082 case BRW_REGISTER_TYPE_F:
2083 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2084 inst->opcode = BRW_OPCODE_MOV;
2085 inst->src[1] = reg_undef;
2086 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2087 progress = true;
2088 }
2089 break;
2090 default:
2091 break;
2092 }
2093 default:
2094 break;
2095 }
2096 }
2097 break;
2098 default:
2099 break;
2100 }
2101 }
2102
2103 return progress;
2104 }
2105
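/* Illustrative note (not from the original source): when a size-1 VGRF is
 * completely re-written outside control flow, the pass below gives the new
 * definition a fresh register number, e.g.
 *   mov vgrf3, u0 ... mov vgrf3, u1 ; add vgrf5, vgrf3, vgrf4
 * becomes
 *   mov vgrf3, u0 ... mov vgrf9, u1 ; add vgrf5, vgrf9, vgrf4
 * (vgrf9 standing in for whatever virtual_grf_alloc returns), breaking
 * false dependencies for the scheduler and register coalescing.
 */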
2106 bool
2107 fs_visitor::opt_register_renaming()
2108 {
2109 bool progress = false;
2110 int depth = 0;
2111
2112 int remap[virtual_grf_count];
2113 memset(remap, -1, sizeof(int) * virtual_grf_count);
2114
2115 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2116 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2117 depth++;
2118 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2119 inst->opcode == BRW_OPCODE_WHILE) {
2120 depth--;
2121 }
2122
2123 /* Rewrite instruction sources. */
2124 for (int i = 0; i < inst->sources; i++) {
2125 if (inst->src[i].file == GRF &&
2126 remap[inst->src[i].reg] != -1 &&
2127 remap[inst->src[i].reg] != inst->src[i].reg) {
2128 inst->src[i].reg = remap[inst->src[i].reg];
2129 progress = true;
2130 }
2131 }
2132
2133 const int dst = inst->dst.reg;
2134
2135 if (depth == 0 &&
2136 inst->dst.file == GRF &&
2137 virtual_grf_sizes[inst->dst.reg] == 1 &&
2138 !inst->is_partial_write()) {
2139 if (remap[dst] == -1) {
2140 remap[dst] = dst;
2141 } else {
2142 remap[dst] = virtual_grf_alloc(1);
2143 inst->dst.reg = remap[dst];
2144 progress = true;
2145 }
2146 } else if (inst->dst.file == GRF &&
2147 remap[dst] != -1 &&
2148 remap[dst] != dst) {
2149 inst->dst.reg = remap[dst];
2150 progress = true;
2151 }
2152 }
2153
2154 if (progress) {
2155 invalidate_live_intervals();
2156
2157 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2158 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2159 delta_x[i].reg = remap[delta_x[i].reg];
2160 }
2161 }
2162 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2163 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2164 delta_y[i].reg = remap[delta_y[i].reg];
2165 }
2166 }
2167 }
2168
2169 return progress;
2170 }
2171
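/* Illustrative example (not from the original source): compute-to-MRF turns
 *   add vgrf5, vgrf1, vgrf2
 *   mov m3, vgrf5
 * into
 *   add m3, vgrf1, vgrf2
 * when vgrf5 is not read again afterwards, eliminating the copy into the
 * message register.
 */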
2172 bool
2173 fs_visitor::compute_to_mrf()
2174 {
2175 bool progress = false;
2176 int next_ip = 0;
2177
2178 calculate_live_intervals();
2179
2180 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2181 int ip = next_ip;
2182 next_ip++;
2183
2184 if (inst->opcode != BRW_OPCODE_MOV ||
2185 inst->is_partial_write() ||
2186 inst->dst.file != MRF || inst->src[0].file != GRF ||
2187 inst->dst.type != inst->src[0].type ||
2188 inst->src[0].abs || inst->src[0].negate ||
2189 !inst->src[0].is_contiguous() ||
2190 inst->src[0].subreg_offset)
2191 continue;
2192
2193 /* Work out which hardware MRF registers are written by this
2194 * instruction.
2195 */
2196 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2197 int mrf_high;
2198 if (inst->dst.reg & BRW_MRF_COMPR4) {
2199 mrf_high = mrf_low + 4;
2200 } else if (dispatch_width == 16 &&
2201 (!inst->force_uncompressed && !inst->force_sechalf)) {
2202 mrf_high = mrf_low + 1;
2203 } else {
2204 mrf_high = mrf_low;
2205 }
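/* Reading of the code above (illustrative): a SIMD16 write to m2 that isn't
 * split into halves covers m2..m3 (mrf_high = mrf_low + 1), a COMPR4 write
 * to m2 covers m2 and m6 (mrf_high = mrf_low + 4), and any other write
 * touches a single MRF.
 */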
2206
2207 /* Can't compute-to-MRF this GRF if someone else was going to
2208 * read it later.
2209 */
2210 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2211 continue;
2212
2213 /* Found a move of a GRF to a MRF. Let's see if we can go
2214 * rewrite the thing that made this GRF to write into the MRF.
2215 */
2216 fs_inst *scan_inst;
2217 for (scan_inst = (fs_inst *)inst->prev;
2218 !scan_inst->is_head_sentinel();
2219 scan_inst = (fs_inst *)scan_inst->prev) {
2220 if (scan_inst->dst.file == GRF &&
2221 scan_inst->dst.reg == inst->src[0].reg) {
2222 /* Found the last thing to write our reg we want to turn
2223 * into a compute-to-MRF.
2224 */
2225
2226 /* If this one instruction didn't populate all the
2227 * channels, bail. We might be able to rewrite everything
2228 * that writes that reg, but it would require smarter
2229 * tracking to delay the rewriting until complete success.
2230 */
2231 if (scan_inst->is_partial_write())
2232 break;
2233
2234 /* Things returning more than one register would need us to
2235 * understand coalescing out more than one MOV at a time.
2236 */
2237 if (scan_inst->regs_written > 1)
2238 break;
2239
2240 /* SEND instructions can't have MRF as a destination. */
2241 if (scan_inst->mlen)
2242 break;
2243
2244 if (brw->gen == 6) {
2245 /* gen6 math instructions must have the destination be
2246 * GRF, so no compute-to-MRF for them.
2247 */
2248 if (scan_inst->is_math()) {
2249 break;
2250 }
2251 }
2252
2253 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2254 /* Found the creator of our MRF's source value. */
2255 scan_inst->dst.file = MRF;
2256 scan_inst->dst.reg = inst->dst.reg;
2257 scan_inst->saturate |= inst->saturate;
2258 inst->remove(block);
2259 progress = true;
2260 }
2261 break;
2262 }
2263
2264 /* We don't handle control flow here. Most computation of
2265 * values that end up in MRFs happens shortly before the MRF
2266 * write anyway.
2267 */
2268 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2269 break;
2270
2271 /* You can't read from an MRF, so if someone else reads our
2272 * MRF's source GRF that we wanted to rewrite, that stops us.
2273 */
2274 bool interfered = false;
2275 for (int i = 0; i < scan_inst->sources; i++) {
2276 if (scan_inst->src[i].file == GRF &&
2277 scan_inst->src[i].reg == inst->src[0].reg &&
2278 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2279 interfered = true;
2280 }
2281 }
2282 if (interfered)
2283 break;
2284
2285 if (scan_inst->dst.file == MRF) {
2286 /* If somebody else writes our MRF here, we can't
2287 * compute-to-MRF before that.
2288 */
2289 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2290 int scan_mrf_high;
2291
2292 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2293 scan_mrf_high = scan_mrf_low + 4;
2294 } else if (dispatch_width == 16 &&
2295 (!scan_inst->force_uncompressed &&
2296 !scan_inst->force_sechalf)) {
2297 scan_mrf_high = scan_mrf_low + 1;
2298 } else {
2299 scan_mrf_high = scan_mrf_low;
2300 }
2301
2302 if (mrf_low == scan_mrf_low ||
2303 mrf_low == scan_mrf_high ||
2304 mrf_high == scan_mrf_low ||
2305 mrf_high == scan_mrf_high) {
2306 break;
2307 }
2308 }
2309
2310 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2311 /* Found a SEND instruction, which means that there are
2312 * live values in MRFs from base_mrf to base_mrf +
2313 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2314 * above it.
2315 */
2316 if (mrf_low >= scan_inst->base_mrf &&
2317 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2318 break;
2319 }
2320 if (mrf_high >= scan_inst->base_mrf &&
2321 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2322 break;
2323 }
2324 }
2325 }
2326 }
2327
2328 if (progress)
2329 invalidate_live_intervals();
2330
2331 return progress;
2332 }
2333
2334 /**
2335 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2336 * instructions to FS_OPCODE_REP_FB_WRITE.
2337 */
2338 void
2339 fs_visitor::try_rep_send()
2340 {
2341 int i, count;
2342 fs_inst *start = NULL;
2343 bblock_t *mov_block;
2344
2345 /* From the Ivybridge PRM, Volume 4 Part 1, section 3.9.11.2
2346 * ("Message Descriptor - Render Target Write"):
2347 *
2348 * "SIMD16_REPDATA message must not be used in SIMD8 pixel-shaders."
2349 */
2350 if (dispatch_width != 16)
2351 return;
2352
2353 /* The constant color write message can't handle anything but the 4 color
2354 * values. We could do MRT, but the loops below would need to understand
2355 * handling the header being enabled or disabled on different messages. It
2356 * also requires that the render target be tiled, which might not be the
2357 * case for some EGLImage paths or if we some day do rendering to PBOs.
2358 */
2359 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH) ||
2360 payload.aa_dest_stencil_reg ||
2361 payload.dest_depth_reg ||
2362 dual_src_output.file != BAD_FILE)
2363 return;
2364
2365 /* The optimization is implemented as one pass through the instruction
2366 * list. We keep track of the most recent block of MOVs into sequential
2367 * MRFs from single, sequential float registers (i.e., uniforms). Then when
2368 * we find an FB_WRITE opcode, we see if the payload registers match the
2369 * destination registers in our block of MOVs.
2370 */
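/* Sketch of the rewrite (illustrative): four "mov m(N + 2k), gM+k" MOVs
 * followed by an FB_WRITE whose payload (after any header) starts at mN
 * collapse into a single vec4 MOV into mN plus a REP_FB_WRITE with the
 * message length reduced by 7.
 */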
2371 count = 0;
2372 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2373 if (count == 0) {
2374 start = inst;
2375 mov_block = block;
2376 }
2377 if (inst->opcode == BRW_OPCODE_MOV &&
2378 inst->dst.file == MRF &&
2379 inst->dst.reg == start->dst.reg + 2 * count &&
2380 inst->src[0].file == HW_REG &&
2381 inst->src[0].reg_offset == start->src[0].reg_offset + count) {
2382 if (count == 0) {
2383 start = inst;
2384 mov_block = block;
2385 }
2386 count++;
2387 }
2388
2389 if (inst->opcode == FS_OPCODE_FB_WRITE &&
2390 count == 4 &&
2391 (inst->base_mrf == start->dst.reg ||
2392 (inst->base_mrf + 2 == start->dst.reg && inst->header_present))) {
2393 fs_inst *mov = MOV(start->dst, start->src[0]);
2394
2395 /* Make a MOV that moves the four floats into the replicated write
2396 * payload. Since we're running at the very end of code generation
2397 * we can use hw registers and generate the stride and offsets we
2398 * need for this MOV. We use the first of the eight registers
2399 * allocated for the SIMD16 payload for the four floats.
2400 */
2401 mov->dst.fixed_hw_reg =
2402 brw_vec4_reg(BRW_MESSAGE_REGISTER_FILE,
2403 start->dst.reg, 0);
2404 mov->dst.file = HW_REG;
2405 mov->dst.type = mov->dst.fixed_hw_reg.type;
2406
2407 mov->src[0].fixed_hw_reg =
2408 brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2409 mov->src[0].file = HW_REG;
2410 mov->src[0].type = mov->src[0].fixed_hw_reg.type;
2411 mov->force_writemask_all = true;
2412 mov->dst.type = BRW_REGISTER_TYPE_F;
2413
2414 /* Replace the four MOVs with the new vec4 MOV. */
2415 start->insert_before(mov_block, mov);
2416 for (i = 0; i < 4; i++)
2417 ((fs_inst *) mov->next)->remove(mov_block);
2418
2419 /* Finally, adjust the message length and set the opcode to
2420 * REP_FB_WRITE for the send, so that the generator will use the
2421 * replicated data message type. Then reset count so we'll start
2422 * looking for a new block in case we're in an MRT shader.
2423 */
2424 inst->opcode = FS_OPCODE_REP_FB_WRITE;
2425 inst->mlen -= 7;
2426 count = 0;
2427 }
2428 }
2429
2430 return;
2431 }
2432
2433 /**
2434 * Walks through basic blocks, looking for repeated MRF writes and
2435 * removing the later ones.
2436 */
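/* e.g. (illustrative): two identical "mov m3, vgrf5" instructions in the
 * same block with no intervening write to m3 or vgrf5 and no intervening
 * control flow: the second MOV is removed.
 */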
2437 bool
2438 fs_visitor::remove_duplicate_mrf_writes()
2439 {
2440 fs_inst *last_mrf_move[16];
2441 bool progress = false;
2442
2443 /* We'd need to update the MRF tracking for compressed instructions, so skip SIMD16. */
2444 if (dispatch_width == 16)
2445 return false;
2446
2447 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2448
2449 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2450 if (inst->is_control_flow()) {
2451 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2452 }
2453
2454 if (inst->opcode == BRW_OPCODE_MOV &&
2455 inst->dst.file == MRF) {
2456 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2457 if (prev_inst && inst->equals(prev_inst)) {
2458 inst->remove(block);
2459 progress = true;
2460 continue;
2461 }
2462 }
2463
2464 /* Clear out the last-write records for MRFs that were overwritten. */
2465 if (inst->dst.file == MRF) {
2466 last_mrf_move[inst->dst.reg] = NULL;
2467 }
2468
2469 if (inst->mlen > 0 && inst->base_mrf != -1) {
2470 /* Found a SEND instruction, which will include two or fewer
2471 * implied MRF writes. We could do better here.
2472 */
2473 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2474 last_mrf_move[inst->base_mrf + i] = NULL;
2475 }
2476 }
2477
2478 /* Clear out any MRF move records whose sources got overwritten. */
2479 if (inst->dst.file == GRF) {
2480 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2481 if (last_mrf_move[i] &&
2482 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2483 last_mrf_move[i] = NULL;
2484 }
2485 }
2486 }
2487
2488 if (inst->opcode == BRW_OPCODE_MOV &&
2489 inst->dst.file == MRF &&
2490 inst->src[0].file == GRF &&
2491 !inst->is_partial_write()) {
2492 last_mrf_move[inst->dst.reg] = inst;
2493 }
2494 }
2495
2496 if (progress)
2497 invalidate_live_intervals();
2498
2499 return progress;
2500 }
2501
2502 static void
2503 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2504 int first_grf, int grf_len)
2505 {
2506 bool inst_simd16 = (dispatch_width > 8 &&
2507 !inst->force_uncompressed &&
2508 !inst->force_sechalf);
2509
2510 /* Clear the flag for registers that actually got read (as expected). */
2511 for (int i = 0; i < inst->sources; i++) {
2512 int grf;
2513 if (inst->src[i].file == GRF) {
2514 grf = inst->src[i].reg;
2515 } else if (inst->src[i].file == HW_REG &&
2516 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2517 grf = inst->src[i].fixed_hw_reg.nr;
2518 } else {
2519 continue;
2520 }
2521
2522 if (grf >= first_grf &&
2523 grf < first_grf + grf_len) {
2524 deps[grf - first_grf] = false;
2525 if (inst_simd16)
2526 deps[grf - first_grf + 1] = false;
2527 }
2528 }
2529 }
2530
2531 /**
2532 * Implements this workaround for the original 965:
2533 *
2534 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2535 * check for post destination dependencies on this instruction, software
2536 * must ensure that there is no destination hazard for the case of ‘write
2537 * followed by a posted write’ shown in the following example.
2538 *
2539 * 1. mov r3 0
2540 * 2. send r3.xy <rest of send instruction>
2541 * 3. mov r2 r3
2542 *
2543 * Due to no post-destination dependency check on the ‘send’, the above
2544 * code sequence could have two instructions (1 and 2) in flight at the
2545 * same time that both consider ‘r3’ as the target of their final writes.
2546 */
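/* Sketch of the mitigation below (illustrative): starting from the send,
 * walk backwards looking for earlier, still-unread writes to the send's
 * destination registers and insert a DEP_RESOLVE_MOV referencing each such
 * register just before the send, so the earlier write must retire before
 * the posted write can land on top of it.
 */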
2547 void
2548 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2549 fs_inst *inst)
2550 {
2551 int reg_size = dispatch_width / 8;
2552 int write_len = inst->regs_written * reg_size;
2553 int first_write_grf = inst->dst.reg;
2554 bool needs_dep[BRW_MAX_MRF];
2555 assert(write_len < (int)sizeof(needs_dep) - 1);
2556
2557 memset(needs_dep, false, sizeof(needs_dep));
2558 memset(needs_dep, true, write_len);
2559
2560 clear_deps_for_inst_src(inst, dispatch_width,
2561 needs_dep, first_write_grf, write_len);
2562
2563 /* Walk backwards looking for writes to registers we're writing which
2564 * aren't read since being written. If we hit the start of the program,
2565 * we assume that there are no outstanding dependencies on entry to the
2566 * program.
2567 */
2568 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2569 !scan_inst->is_head_sentinel();
2570 scan_inst = (fs_inst *)scan_inst->prev) {
2571
2572 /* If we hit control flow, assume that there *are* outstanding
2573 * dependencies, and force their cleanup before our instruction.
2574 */
2575 if (scan_inst->is_control_flow()) {
2576 for (int i = 0; i < write_len; i++) {
2577 if (needs_dep[i]) {
2578 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2579 }
2580 }
2581 return;
2582 }
2583
2584 bool scan_inst_simd16 = (dispatch_width > 8 &&
2585 !scan_inst->force_uncompressed &&
2586 !scan_inst->force_sechalf);
2587
2588 /* We insert our reads as late as possible on the assumption that any
2589 * instruction but a MOV that might have left us an outstanding
2590 * dependency has more latency than a MOV.
2591 */
2592 if (scan_inst->dst.file == GRF) {
2593 for (int i = 0; i < scan_inst->regs_written; i++) {
2594 int reg = scan_inst->dst.reg + i * reg_size;
2595
2596 if (reg >= first_write_grf &&
2597 reg < first_write_grf + write_len &&
2598 needs_dep[reg - first_write_grf]) {
2599 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2600 needs_dep[reg - first_write_grf] = false;
2601 if (scan_inst_simd16)
2602 needs_dep[reg - first_write_grf + 1] = false;
2603 }
2604 }
2605 }
2606
2607 /* Clear the flag for registers that actually got read (as expected). */
2608 clear_deps_for_inst_src(scan_inst, dispatch_width,
2609 needs_dep, first_write_grf, write_len);
2610
2611 /* Continue the loop only if we haven't resolved all the dependencies */
2612 int i;
2613 for (i = 0; i < write_len; i++) {
2614 if (needs_dep[i])
2615 break;
2616 }
2617 if (i == write_len)
2618 return;
2619 }
2620 }
2621
2622 /**
2623 * Implements this workaround for the original 965:
2624 *
2625 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2626 * used as a destination register until after it has been sourced by an
2627 * instruction with a different destination register.
2628 */
2629 void
2630 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2631 {
2632 int write_len = inst->regs_written * dispatch_width / 8;
2633 int first_write_grf = inst->dst.reg;
2634 bool needs_dep[BRW_MAX_MRF];
2635 assert(write_len < (int)sizeof(needs_dep) - 1);
2636
2637 memset(needs_dep, false, sizeof(needs_dep));
2638 memset(needs_dep, true, write_len);
2639 /* Walk forwards looking for writes to registers we're writing which aren't
2640 * read before being written.
2641 */
2642 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2643 !scan_inst->is_tail_sentinel();
2644 scan_inst = (fs_inst *)scan_inst->next) {
2645 /* If we hit control flow, force resolve all remaining dependencies. */
2646 if (scan_inst->is_control_flow()) {
2647 for (int i = 0; i < write_len; i++) {
2648 if (needs_dep[i])
2649 scan_inst->insert_before(block,
2650 DEP_RESOLVE_MOV(first_write_grf + i));
2651 }
2652 return;
2653 }
2654
2655 /* Clear the flag for registers that actually got read (as expected). */
2656 clear_deps_for_inst_src(scan_inst, dispatch_width,
2657 needs_dep, first_write_grf, write_len);
2658
2659 /* We insert our reads as late as possible since they're reading the
2660 * result of a SEND, which has massive latency.
2661 */
2662 if (scan_inst->dst.file == GRF &&
2663 scan_inst->dst.reg >= first_write_grf &&
2664 scan_inst->dst.reg < first_write_grf + write_len &&
2665 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2666 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2667 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2668 }
2669
2670 /* Continue the loop only if we haven't resolved all the dependencies */
2671 int i;
2672 for (i = 0; i < write_len; i++) {
2673 if (needs_dep[i])
2674 break;
2675 }
2676 if (i == write_len)
2677 return;
2678 }
2679
2680 /* If we hit the end of the program, resolve all remaining dependencies out
2681 * of paranoia.
2682 */
2683 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2684 assert(last_inst->eot);
2685 for (int i = 0; i < write_len; i++) {
2686 if (needs_dep[i])
2687 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2688 }
2689 }
2690
2691 void
2692 fs_visitor::insert_gen4_send_dependency_workarounds()
2693 {
2694 if (brw->gen != 4 || brw->is_g4x)
2695 return;
2696
2697 bool progress = false;
2698
2699 /* Note that we're done with register allocation, so GRF fs_regs always
2700 * have a .reg_offset of 0.
2701 */
2702
2703 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2704 if (inst->mlen != 0 && inst->dst.file == GRF) {
2705 insert_gen4_pre_send_dependency_workarounds(block, inst);
2706 insert_gen4_post_send_dependency_workarounds(block, inst);
2707 progress = true;
2708 }
2709 }
2710
2711 if (progress)
2712 invalidate_live_intervals();
2713 }
2714
2715 /**
2716 * Turns the generic expression-style uniform pull constant load instruction
2717 * into a hardware-specific series of instructions for loading a pull
2718 * constant.
2719 *
2720 * The expression style allows the CSE pass before this to optimize out
2721 * repeated loads from the same offset, and gives the pre-register-allocation
2722 * scheduling full flexibility, while the conversion to native instructions
2723 * allows the post-register-allocation scheduler the best information
2724 * possible.
2725 *
2726 * Note that execution masking for setting up pull constant loads is special:
2727 * the channels that need to be written are unrelated to the current execution
2728 * mask, since a later instruction will use one of the result channels as a
2729 * source operand for all 8 or 16 of its channels.
2730 */
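/* Illustrative lowering (not from the original source): on gen7+ a generic
 * "uniform_pull_const_load dst, surface, byte_offset" becomes a
 * force_writemask_all SET_SIMD4X2_OFFSET writing byte_offset / 4 into a
 * payload VGRF, with the load rewritten to the GEN7 opcode sourcing that
 * payload; on earlier gens the load simply gets m14 and mlen = 1 assigned.
 */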
2731 void
2732 fs_visitor::lower_uniform_pull_constant_loads()
2733 {
2734 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2735 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2736 continue;
2737
2738 if (brw->gen >= 7) {
2739 /* The offset arg before was a vec4-aligned byte offset. We need to
2740 * turn it into a dword offset.
2741 */
2742 fs_reg const_offset_reg = inst->src[1];
2743 assert(const_offset_reg.file == IMM &&
2744 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2745 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2746 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2747
2748 /* This is actually going to be a MOV, but since only the first dword
2749 * is accessed, we have a special opcode to do just that one. Note
2750 * that this needs to be an operation that will be considered a def
2751 * by live variable analysis, or register allocation will explode.
2752 */
2753 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2754 payload, const_offset_reg);
2755 setup->force_writemask_all = true;
2756
2757 setup->ir = inst->ir;
2758 setup->annotation = inst->annotation;
2759 inst->insert_before(block, setup);
2760
2761 /* Similarly, this will only populate the first 4 channels of the
2762 * result register (since we only use smear values from 0-3), but we
2763 * don't tell the optimizer.
2764 */
2765 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2766 inst->src[1] = payload;
2767
2768 invalidate_live_intervals();
2769 } else {
2770 /* Before register allocation, we didn't tell the scheduler about the
2771 * MRF we use. We know it's safe to use this MRF because nothing
2772 * else does except for register spill/unspill, which generates and
2773 * uses its MRF within a single IR instruction.
2774 */
2775 inst->base_mrf = 14;
2776 inst->mlen = 1;
2777 }
2778 }
2779 }
2780
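/* Illustrative lowering (not from the original source):
 *   load_payload vgrf8, {header, a, b}
 * becomes
 *   mov vgrf8+0, header
 *   mov vgrf8+1, a
 *   mov vgrf8+2, b
 * with the header MOV omitted (but the offset still advanced) when src[0]
 * is BAD_FILE.
 */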
2781 bool
2782 fs_visitor::lower_load_payload()
2783 {
2784 bool progress = false;
2785
2786 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2787 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2788 fs_reg dst = inst->dst;
2789
2790 /* src[0] represents the (optional) message header. */
2791 if (inst->src[0].file != BAD_FILE) {
2792 inst->insert_before(block, MOV(dst, inst->src[0]));
2793 }
2794 dst.reg_offset++;
2795
2796 for (int i = 1; i < inst->sources; i++) {
2797 inst->insert_before(block, MOV(dst, inst->src[i]));
2798 dst.reg_offset++;
2799 }
2800
2801 inst->remove(block);
2802 progress = true;
2803 }
2804 }
2805
2806 if (progress)
2807 invalidate_live_intervals();
2808
2809 return progress;
2810 }
2811
2812 void
2813 fs_visitor::dump_instructions()
2814 {
2815 dump_instructions(NULL);
2816 }
2817
2818 void
2819 fs_visitor::dump_instructions(const char *name)
2820 {
2821 calculate_register_pressure();
2822 FILE *file = stderr;
2823 if (name && geteuid() != 0) {
2824 file = fopen(name, "w");
2825 if (!file)
2826 file = stderr;
2827 }
2828
2829 int ip = 0, max_pressure = 0;
2830 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
2831 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2832 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2833 dump_instruction(inst, file);
2834 ++ip;
2835 }
2836 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2837
2838 if (file != stderr) {
2839 fclose(file);
2840 }
2841 }
2842
2843 void
2844 fs_visitor::dump_instruction(backend_instruction *be_inst)
2845 {
2846 dump_instruction(be_inst, stderr);
2847 }
2848
2849 void
2850 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2851 {
2852 fs_inst *inst = (fs_inst *)be_inst;
2853
2854 if (inst->predicate) {
2855 fprintf(file, "(%cf0.%d) ",
2856 inst->predicate_inverse ? '-' : '+',
2857 inst->flag_subreg);
2858 }
2859
2860 fprintf(file, "%s", brw_instruction_name(inst->opcode));
2861 if (inst->saturate)
2862 fprintf(file, ".sat");
2863 if (inst->conditional_mod) {
2864 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2865 if (!inst->predicate &&
2866 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2867 inst->opcode != BRW_OPCODE_IF &&
2868 inst->opcode != BRW_OPCODE_WHILE))) {
2869 fprintf(file, ".f0.%d", inst->flag_subreg);
2870 }
2871 }
2872 fprintf(file, " ");
2873
2874
2875 switch (inst->dst.file) {
2876 case GRF:
2877 fprintf(file, "vgrf%d", inst->dst.reg);
2878 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2879 inst->dst.subreg_offset)
2880 fprintf(file, "+%d.%d",
2881 inst->dst.reg_offset, inst->dst.subreg_offset);
2882 break;
2883 case MRF:
2884 fprintf(file, "m%d", inst->dst.reg);
2885 break;
2886 case BAD_FILE:
2887 fprintf(file, "(null)");
2888 break;
2889 case UNIFORM:
2890 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2891 break;
2892 case HW_REG:
2893 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2894 switch (inst->dst.fixed_hw_reg.nr) {
2895 case BRW_ARF_NULL:
2896 fprintf(file, "null");
2897 break;
2898 case BRW_ARF_ADDRESS:
2899 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2900 break;
2901 case BRW_ARF_ACCUMULATOR:
2902 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2903 break;
2904 case BRW_ARF_FLAG:
2905 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2906 inst->dst.fixed_hw_reg.subnr);
2907 break;
2908 default:
2909 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2910 inst->dst.fixed_hw_reg.subnr);
2911 break;
2912 }
2913 } else {
2914 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2915 }
2916 if (inst->dst.fixed_hw_reg.subnr)
2917 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2918 break;
2919 default:
2920 fprintf(file, "???");
2921 break;
2922 }
2923 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2924
2925 for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2926 if (inst->src[i].negate)
2927 fprintf(file, "-");
2928 if (inst->src[i].abs)
2929 fprintf(file, "|");
2930 switch (inst->src[i].file) {
2931 case GRF:
2932 fprintf(file, "vgrf%d", inst->src[i].reg);
2933 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2934 inst->src[i].subreg_offset)
2935 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2936 inst->src[i].subreg_offset);
2937 break;
2938 case MRF:
2939 fprintf(file, "***m%d***", inst->src[i].reg);
2940 break;
2941 case UNIFORM:
2942 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2943 if (inst->src[i].reladdr) {
2944 fprintf(file, "+reladdr");
2945 } else if (inst->src[i].subreg_offset) {
2946 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2947 inst->src[i].subreg_offset);
2948 }
2949 break;
2950 case BAD_FILE:
2951 fprintf(file, "(null)");
2952 break;
2953 case IMM:
2954 switch (inst->src[i].type) {
2955 case BRW_REGISTER_TYPE_F:
2956 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
2957 break;
2958 case BRW_REGISTER_TYPE_D:
2959 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
2960 break;
2961 case BRW_REGISTER_TYPE_UD:
2962 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
2963 break;
2964 default:
2965 fprintf(file, "???");
2966 break;
2967 }
2968 break;
2969 case HW_REG:
2970 if (inst->src[i].fixed_hw_reg.negate)
2971 fprintf(file, "-");
2972 if (inst->src[i].fixed_hw_reg.abs)
2973 fprintf(file, "|");
2974 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2975 switch (inst->src[i].fixed_hw_reg.nr) {
2976 case BRW_ARF_NULL:
2977 fprintf(file, "null");
2978 break;
2979 case BRW_ARF_ADDRESS:
2980 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2981 break;
2982 case BRW_ARF_ACCUMULATOR:
2983 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2984 break;
2985 case BRW_ARF_FLAG:
2986 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2987 inst->src[i].fixed_hw_reg.subnr);
2988 break;
2989 default:
2990 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2991 inst->src[i].fixed_hw_reg.subnr);
2992 break;
2993 }
2994 } else {
2995 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2996 }
2997 if (inst->src[i].fixed_hw_reg.subnr)
2998 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2999 if (inst->src[i].fixed_hw_reg.abs)
3000 fprintf(file, "|");
3001 break;
3002 default:
3003 fprintf(file, "???");
3004 break;
3005 }
3006 if (inst->src[i].abs)
3007 fprintf(file, "|");
3008
3009 if (inst->src[i].file != IMM) {
3010 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3011 }
3012
3013 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3014 fprintf(file, ", ");
3015 }
3016
3017 fprintf(file, " ");
3018
3019 if (inst->force_uncompressed)
3020 fprintf(file, "1sthalf ");
3021
3022 if (inst->force_sechalf)
3023 fprintf(file, "2ndhalf ");
3024
3025 fprintf(file, "\n");
3026 }
3027
3028 /**
3029 * Possibly returns an instruction that set up @param reg.
3030 *
3031 * Sometimes we want to take the result of some expression/variable
3032 * dereference tree and rewrite the instruction generating the result
3033 * of the tree. When processing the tree, we know that the
3034 * instructions generated are all writing temporaries that are dead
3035 * outside of this tree. So, if we have some instructions that write
3036 * a temporary, we're free to point that temp write somewhere else.
3037 *
3038 * Note that this doesn't guarantee that the instruction only generated
3039 * reg -- it might be the size=4 destination of a texture instruction.
3040 */
3041 fs_inst *
3042 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3043 fs_inst *end,
3044 const fs_reg &reg)
3045 {
3046 if (end == start ||
3047 end->is_partial_write() ||
3048 reg.reladdr ||
3049 !reg.equals(end->dst)) {
3050 return NULL;
3051 } else {
3052 return end;
3053 }
3054 }
3055
3056 void
3057 fs_visitor::setup_payload_gen6()
3058 {
3059 bool uses_depth =
3060 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3061 unsigned barycentric_interp_modes =
3062 (stage == MESA_SHADER_FRAGMENT) ?
3063 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3064
3065 assert(brw->gen >= 6);
3066
3067 /* R0-1: masks, pixel X/Y coordinates. */
3068 payload.num_regs = 2;
3069 /* R2: only for 32-pixel dispatch. */
3070
3071 /* R3-26: barycentric interpolation coordinates. These appear in the
3072 * same order that they appear in the brw_wm_barycentric_interp_mode
3073 * enum. Each set of coordinates occupies 2 registers if dispatch width
3074 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3075 * appear if they were enabled using the "Barycentric Interpolation
3076 * Mode" bits in WM_STATE.
3077 */
3078 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3079 if (barycentric_interp_modes & (1 << i)) {
3080 payload.barycentric_coord_reg[i] = payload.num_regs;
3081 payload.num_regs += 2;
3082 if (dispatch_width == 16) {
3083 payload.num_regs += 2;
3084 }
3085 }
3086 }
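/* e.g. (illustrative): a SIMD8 shader with only perspective pixel
 * barycentrics enabled has payload.num_regs == 4 at this point: the two
 * fixed registers (masks, pixel X/Y) plus two for the single enabled
 * barycentric set.
 */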
3087
3088 /* R27: interpolated depth if uses source depth */
3089 if (uses_depth) {
3090 payload.source_depth_reg = payload.num_regs;
3091 payload.num_regs++;
3092 if (dispatch_width == 16) {
3093 /* R28: interpolated depth if not SIMD8. */
3094 payload.num_regs++;
3095 }
3096 }
3097 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3098 if (uses_depth) {
3099 payload.source_w_reg = payload.num_regs;
3100 payload.num_regs++;
3101 if (dispatch_width == 16) {
3102 /* R30: interpolated W if not SIMD8. */
3103 payload.num_regs++;
3104 }
3105 }
3106
3107 if (stage == MESA_SHADER_FRAGMENT) {
3108 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3109 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3110 prog_data->uses_pos_offset = key->compute_pos_offset;
3111 /* R31: MSAA position offsets. */
3112 if (prog_data->uses_pos_offset) {
3113 payload.sample_pos_reg = payload.num_regs;
3114 payload.num_regs++;
3115 }
3116 }
3117
3118 /* R32: MSAA input coverage mask */
3119 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3120 assert(brw->gen >= 7);
3121 payload.sample_mask_in_reg = payload.num_regs;
3122 payload.num_regs++;
3123 if (dispatch_width == 16) {
3124 /* R33: input coverage mask if not SIMD8. */
3125 payload.num_regs++;
3126 }
3127 }
3128
3129 /* R34-: bary for 32-pixel. */
3130 /* R58-59: interp W for 32-pixel. */
3131
3132 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3133 source_depth_to_render_target = true;
3134 }
3135 }
3136
3137 void
3138 fs_visitor::assign_binding_table_offsets()
3139 {
3140 assert(stage == MESA_SHADER_FRAGMENT);
3141 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3142 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3143 uint32_t next_binding_table_offset = 0;
3144
3145 /* If there are no color regions, we still perform an FB write to a null
3146 * renderbuffer, which we place at surface index 0.
3147 */
3148 prog_data->binding_table.render_target_start = next_binding_table_offset;
3149 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3150
3151 assign_common_binding_table_offsets(next_binding_table_offset);
3152 }
3153
3154 void
3155 fs_visitor::calculate_register_pressure()
3156 {
3157 invalidate_live_intervals();
3158 calculate_live_intervals();
3159
3160 unsigned num_instructions = instructions.length();
3161
3162 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3163
3164 for (int reg = 0; reg < virtual_grf_count; reg++) {
3165 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3166 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3167 }
3168 }
3169
3170 /**
3171 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3172 *
3173 * The needs_unlit_centroid_workaround ends up producing one of these per
3174 * channel of centroid input, so it's good to clean them up.
3175 *
3176 * An assumption here is that nothing ever modifies the dispatched pixels
3177 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3178 * dictates that anyway.
3179 */
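/* e.g. (illustrative): two FS_OPCODE_MOV_DISPATCH_TO_FLAGS writing f0.1 in
 * the same block with no intervening flag write or control flow: the later
 * one is removed.
 */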
3180 void
3181 fs_visitor::opt_drop_redundant_mov_to_flags()
3182 {
3183 bool flag_mov_found[2] = {false};
3184
3185 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3186 if (inst->is_control_flow()) {
3187 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3188 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3189 if (!flag_mov_found[inst->flag_subreg])
3190 flag_mov_found[inst->flag_subreg] = true;
3191 else
3192 inst->remove(block);
3193 } else if (inst->writes_flag()) {
3194 flag_mov_found[inst->flag_subreg] = false;
3195 }
3196 }
3197 }
3198
3199 bool
3200 fs_visitor::run()
3201 {
3202 sanity_param_count = prog->Parameters->NumParameters;
3203 bool allocated_without_spills;
3204
3205 assign_binding_table_offsets();
3206
3207 if (brw->gen >= 6)
3208 setup_payload_gen6();
3209 else
3210 setup_payload_gen4();
3211
3212 if (0) {
3213 emit_dummy_fs();
3214 } else {
3215 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3216 emit_shader_time_begin();
3217
3218 calculate_urb_setup();
3219 if (prog->InputsRead > 0) {
3220 if (brw->gen < 6)
3221 emit_interpolation_setup_gen4();
3222 else
3223 emit_interpolation_setup_gen6();
3224 }
3225
3226 /* We handle discards by keeping track of the still-live pixels in f0.1.
3227 * Initialize it with the dispatched pixels.
3228 */
3229 bool uses_kill =
3230 (stage == MESA_SHADER_FRAGMENT) &&
3231 ((brw_wm_prog_data*) this->prog_data)->uses_kill;
3232 bool alpha_test_func =
3233 (stage == MESA_SHADER_FRAGMENT) &&
3234 ((brw_wm_prog_key*) this->key)->alpha_test_func;
3235 if (uses_kill || alpha_test_func) {
3236 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3237 discard_init->flag_subreg = 1;
3238 }
3239
3240 /* Generate FS IR for main(). (The visitor only descends into
3241 * functions called "main".)
3242 */
3243 if (shader) {
3244 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3245 base_ir = ir;
3246 this->result = reg_undef;
3247 ir->accept(this);
3248 }
3249 } else {
3250 emit_fragment_program_code();
3251 }
3252 base_ir = NULL;
3253 if (failed)
3254 return false;
3255
3256 emit(FS_OPCODE_PLACEHOLDER_HALT);
3257
3258 if (alpha_test_func)
3259 emit_alpha_test();
3260
3261 emit_fb_writes();
3262
3263 calculate_cfg();
3264
3265 split_virtual_grfs();
3266
3267 move_uniform_array_access_to_pull_constants();
3268 assign_constant_locations();
3269 demote_pull_constants();
3270
3271 opt_drop_redundant_mov_to_flags();
3272
3273 #define OPT(pass, args...) do { \
3274 pass_num++; \
3275 bool this_progress = pass(args); \
3276 \
3277 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3278 char filename[64]; \
3279 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3280 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3281 \
3282 backend_visitor::dump_instructions(filename); \
3283 } \
3284 \
3285 progress = progress || this_progress; \
3286 } while (false)
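/* The macro above runs one pass, dumps the IR after it when INTEL_DEBUG's
 * optimizer flag is set and the pass made progress, and ORs the result into
 * the per-iteration progress flag that drives the loop below.
 */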
3287
3288 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3289 char filename[64];
3290 snprintf(filename, 64, "fs%d-%04d-00-start",
3291 dispatch_width, shader_prog ? shader_prog->Name : 0);
3292
3293 backend_visitor::dump_instructions(filename);
3294 }
3295
3296 bool progress;
3297 int iteration = 0;
3298 do {
3299 progress = false;
3300 iteration++;
3301 int pass_num = 0;
3302
3303 compact_virtual_grfs();
3304
3305 OPT(remove_duplicate_mrf_writes);
3306
3307 OPT(opt_algebraic);
3308 OPT(opt_cse);
3309 OPT(opt_copy_propagate);
3310 OPT(opt_peephole_predicated_break);
3311 OPT(dead_code_eliminate);
3312 OPT(opt_peephole_sel);
3313 OPT(dead_control_flow_eliminate, this);
3314 OPT(opt_register_renaming);
3315 OPT(opt_saturate_propagation);
3316 OPT(register_coalesce);
3317 OPT(compute_to_mrf);
3318 } while (progress);
3319
3320 if (lower_load_payload()) {
3321 register_coalesce();
3322 dead_code_eliminate();
3323 }
3324
3325 lower_uniform_pull_constant_loads();
3326
3327 assign_curb_setup();
3328 assign_urb_setup();
3329
3330 static enum instruction_scheduler_mode pre_modes[] = {
3331 SCHEDULE_PRE,
3332 SCHEDULE_PRE_NON_LIFO,
3333 SCHEDULE_PRE_LIFO,
3334 };
3335
3336 /* Try each scheduling heuristic to see if it can successfully register
3337 * allocate without spilling. They should be ordered by decreasing
3338 * performance but increasing likelihood of allocating.
3339 */
3340 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3341 schedule_instructions(pre_modes[i]);
3342
3343 if (0) {
3344 assign_regs_trivial();
3345 allocated_without_spills = true;
3346 } else {
3347 allocated_without_spills = assign_regs(false);
3348 }
3349 if (allocated_without_spills)
3350 break;
3351 }
3352
3353 if (!allocated_without_spills) {
3354 /* We assume that any spilling is worse than just dropping back to
3355 * SIMD8. There's probably actually some intermediate point where
3356 * SIMD16 with a couple of spills is still better.
3357 */
3358 if (dispatch_width == 16) {
3359 fail("Failure to register allocate. Reduce number of "
3360 "live scalar values to avoid this.");
3361 } else {
3362 perf_debug("Fragment shader triggered register spilling. "
3363 "Try reducing the number of live scalar values to "
3364 "improve performance.\n");
3365 }
3366
3367 /* Since we're out of heuristics, just go spill registers until we
3368 * get an allocation.
3369 */
3370 while (!assign_regs(true)) {
3371 if (failed)
3372 break;
3373 }
3374 }
3375 }
3376 assert(force_uncompressed_stack == 0);
3377
3378 /* This must come after all optimization and register allocation, since
3379 * it inserts dead code that happens to have side effects, and it does
3380 * so based on the actual physical registers in use.
3381 */
3382 insert_gen4_send_dependency_workarounds();
3383
3384 if (failed)
3385 return false;
3386
3387 if (!allocated_without_spills)
3388 schedule_instructions(SCHEDULE_POST);
3389
3390 if (last_scratch > 0) {
3391 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3392 }
3393
3394 if (brw->use_rep_send)
3395 try_rep_send();
3396
3397 if (stage == MESA_SHADER_FRAGMENT) {
3398 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3399 if (dispatch_width == 8)
3400 prog_data->reg_blocks = brw_register_blocks(grf_used);
3401 else
3402 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3403 }
3404
3405 /* If any state parameters were appended, then ParameterValues could have
3406 * been realloced, in which case the driver uniform storage set up by
3407 * _mesa_associate_uniform_storage() would point to freed memory. Make
3408 * sure that didn't happen.
3409 */
3410 assert(sanity_param_count == prog->Parameters->NumParameters);
3411
3412 return !failed;
3413 }
3414
3415 const unsigned *
3416 brw_wm_fs_emit(struct brw_context *brw,
3417 void *mem_ctx,
3418 const struct brw_wm_prog_key *key,
3419 struct brw_wm_prog_data *prog_data,
3420 struct gl_fragment_program *fp,
3421 struct gl_shader_program *prog,
3422 unsigned *final_assembly_size)
3423 {
3424 bool start_busy = false;
3425 double start_time = 0;
3426
3427 if (unlikely(brw->perf_debug)) {
3428 start_busy = (brw->batch.last_bo &&
3429 drm_intel_bo_busy(brw->batch.last_bo));
3430 start_time = get_time();
3431 }
3432
3433 struct brw_shader *shader = NULL;
3434 if (prog)
3435 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3436
3437 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3438 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3439
3440 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3441 */
3442 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3443 if (!v.run()) {
3444 if (prog) {
3445 prog->LinkStatus = false;
3446 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3447 }
3448
3449 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3450 v.fail_msg);
3451
3452 return NULL;
3453 }
3454
3455 cfg_t *simd16_cfg = NULL;
3456 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3457 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3458 brw->use_rep_send)) {
3459 if (!v.simd16_unsupported) {
3460 /* Try a SIMD16 compile */
3461 v2.import_uniforms(&v);
3462 if (!v2.run()) {
3463 perf_debug("SIMD16 shader failed to compile, falling back to "
3464 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3465 } else {
3466 simd16_cfg = v2.cfg;
3467 }
3468 } else {
3469 perf_debug("SIMD16 shader unsupported, falling back to "
3470 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3471 }
3472 }
3473
3474 cfg_t *simd8_cfg;
3475 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3476 if (no_simd8 && simd16_cfg) {
3477 simd8_cfg = NULL;
3478 prog_data->no_8 = true;
3479 } else {
3480 simd8_cfg = v.cfg;
3481 prog_data->no_8 = false;
3482 }
3483
3484 const unsigned *assembly = NULL;
3485 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3486 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3487 assembly = g.generate_assembly(simd8_cfg, simd16_cfg,
3488 final_assembly_size);
3489
3490 if (unlikely(brw->perf_debug) && shader) {
3491 if (shader->compiled_once)
3492 brw_wm_debug_recompile(brw, prog, key);
3493 shader->compiled_once = true;
3494
3495 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3496 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3497 (get_time() - start_time) * 1000);
3498 }
3499 }
3500
3501 return assembly;
3502 }
3503
3504 bool
3505 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3506 {
3507 struct brw_context *brw = brw_context(ctx);
3508 struct brw_wm_prog_key key;
3509
3510 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3511 return true;
3512
3513 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3514 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3515 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3516 bool program_uses_dfdy = fp->UsesDFdy;
3517
3518 memset(&key, 0, sizeof(key));
3519
3520 if (brw->gen < 6) {
3521 if (fp->UsesKill)
3522 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3523
3524 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3525 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3526
3527 /* Just assume depth testing. */
3528 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3529 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3530 }
3531
3532 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3533 BRW_FS_VARYING_INPUT_MASK) > 16)
3534 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3535
3536 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3537 for (unsigned i = 0; i < sampler_count; i++) {
3538 if (fp->Base.ShadowSamplers & (1 << i)) {
3539 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3540 key.tex.swizzles[i] =
3541 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3542 } else {
3543 /* Color sampler: assume no swizzling. */
3544 key.tex.swizzles[i] = SWIZZLE_XYZW;
3545 }
3546 }
3547
3548 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3549 key.drawable_height = ctx->DrawBuffer->Height;
3550 }
3551
3552 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3553 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3554 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3555
3556 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3557 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3558 key.nr_color_regions > 1;
3559 }
3560
3561 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3562 * quality of the derivatives is likely to be determined by the driconf
3563 * option.
3564 */
3565 key.high_quality_derivatives = brw->disable_derivative_optimization;
3566
3567 key.program_string_id = bfp->id;
3568
3569 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3570 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3571
3572 bool success = do_wm_prog(brw, prog, bfp, &key);
3573
3574 brw->wm.base.prog_offset = old_prog_offset;
3575 brw->wm.prog_data = old_prog_data;
3576
3577 return success;
3578 }