i965: Move curb_read_length/total_scratch to brw_stage_prog_data.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
55 void
56 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->opcode = opcode;
61 this->dst = dst;
62 this->src = src;
63 this->sources = sources;
64
65 this->conditional_mod = BRW_CONDITIONAL_NONE;
66
67 /* This will be the case for almost all instructions. */
68 this->regs_written = 1;
69
70 this->writes_accumulator = false;
71 }
72
73 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
74 {
75 fs_reg *src = ralloc_array(this, fs_reg, 3);
76 init(opcode, dst, src, 0);
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
80 {
81 fs_reg *src = ralloc_array(this, fs_reg, 3);
82 src[0] = src0;
83 init(opcode, dst, src, 1);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
87 const fs_reg &src1)
88 {
89 fs_reg *src = ralloc_array(this, fs_reg, 3);
90 src[0] = src0;
91 src[1] = src1;
92 init(opcode, dst, src, 2);
93 }
94
95 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
96 const fs_reg &src1, const fs_reg &src2)
97 {
98 fs_reg *src = ralloc_array(this, fs_reg, 3);
99 src[0] = src0;
100 src[1] = src1;
101 src[2] = src2;
102 init(opcode, dst, src, 3);
103 }
104
105 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
106 {
107 init(opcode, dst, src, sources);
108 }
109
110 fs_inst::fs_inst(const fs_inst &that)
111 {
112 memcpy(this, &that, sizeof(that));
113
114 this->src = ralloc_array(this, fs_reg, that.sources);
115
116 for (int i = 0; i < that.sources; i++)
117 this->src[i] = that.src[i];
118 }
119
120 void
121 fs_inst::resize_sources(uint8_t num_sources)
122 {
123 if (this->sources != num_sources) {
124 this->src = reralloc(this, this->src, fs_reg, num_sources);
125 this->sources = num_sources;
126 }
127 }
128
129 #define ALU1(op) \
130 fs_inst * \
131 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
132 { \
133 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
134 }
135
136 #define ALU2(op) \
137 fs_inst * \
138 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
139 const fs_reg &src1) \
140 { \
141 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
142 }
143
144 #define ALU2_ACC(op) \
145 fs_inst * \
146 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
147 const fs_reg &src1) \
148 { \
149 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
150 inst->writes_accumulator = true; \
151 return inst; \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
157 const fs_reg &src1, const fs_reg &src2) \
158 { \
159 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
160 }
161
162 ALU1(NOT)
163 ALU1(MOV)
164 ALU1(FRC)
165 ALU1(RNDD)
166 ALU1(RNDE)
167 ALU1(RNDZ)
168 ALU2(ADD)
169 ALU2(MUL)
170 ALU2_ACC(MACH)
171 ALU2(AND)
172 ALU2(OR)
173 ALU2(XOR)
174 ALU2(SHL)
175 ALU2(SHR)
176 ALU2(ASR)
177 ALU3(LRP)
178 ALU1(BFREV)
179 ALU3(BFE)
180 ALU2(BFI1)
181 ALU3(BFI2)
182 ALU1(FBH)
183 ALU1(FBL)
184 ALU1(CBIT)
185 ALU3(MAD)
186 ALU2_ACC(ADDC)
187 ALU2_ACC(SUBB)
188 ALU2(SEL)
189 ALU2(MAC)
190
191 /** Gen4 predicated IF. */
192 fs_inst *
193 fs_visitor::IF(enum brw_predicate predicate)
194 {
195 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
196 inst->predicate = predicate;
197 return inst;
198 }
199
200 /** Gen6 IF with embedded comparison. */
201 fs_inst *
202 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
203 enum brw_conditional_mod condition)
204 {
205 assert(brw->gen == 6);
206 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
207 reg_null_d, src0, src1);
208 inst->conditional_mod = condition;
209 return inst;
210 }
211
212 /**
213 * CMP: Sets the low bit of the destination channels with the result
214 * of the comparison, while the upper bits are undefined, and updates
215 * the flag register with the packed 16 bits of the result.
216 */
217 fs_inst *
218 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
219 enum brw_conditional_mod condition)
220 {
221 fs_inst *inst;
222
223 /* Take the instruction:
224 *
225 * CMP null<d> src0<f> src1<f>
226 *
227 * Original gen4 does type conversion to the destination type before
228 * comparison, producing garbage results for floating point comparisons.
229 * gen5 does the comparison on the execution type (resolved source types),
230 * so dst type doesn't matter. gen6 does comparison and then uses the
231 * result as if it was the dst type with no conversion, which happens to
232 * mostly work out for float-interpreted-as-int since our comparisons are
233 * for >0, =0, <0.
234 */
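   /* For illustration (assumed values, not taken from the PRM): with a
    * D-typed null destination on gen4, comparing 0.5f against 0.3f would
    * convert both sources to 0 before the compare and report "not greater".
    * Retyping the destination to the source type below keeps the comparison
    * in floating point.
    */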
235 if (brw->gen == 4) {
236 dst.type = src0.type;
237 if (dst.file == HW_REG)
238 dst.fixed_hw_reg.type = dst.type;
239 }
240
241 resolve_ud_negate(&src0);
242 resolve_ud_negate(&src1);
243
244 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
245 inst->conditional_mod = condition;
246
247 return inst;
248 }
249
250 fs_inst *
251 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
252 {
253 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
254 sources);
255 inst->regs_written = sources;
256
257 return inst;
258 }
259
260 exec_list
261 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
262 const fs_reg &surf_index,
263 const fs_reg &varying_offset,
264 uint32_t const_offset)
265 {
266 exec_list instructions;
267 fs_inst *inst;
268
269 /* We have our constant surface use a pitch of 4 bytes, so our index can
270 * be any component of a vector, and then we load 4 contiguous
271 * components starting from that.
272 *
273 * We break down the const_offset to a portion added to the variable
274 * offset and a portion done using reg_offset, which means that if you
275 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
276 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
277 * CSE can later notice that those loads are all the same and eliminate
278 * the redundant ones.
279 */
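   /* A worked example with illustrative numbers: for const_offset = 7 we add
    * (7 & ~3) = 4 to the variable offset below, load a whole vec4 from that
    * address, and then select component (7 & 3) = 3 of the result via
    * reg_offset at the end of this function.
    */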
280 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
281 instructions.push_tail(ADD(vec4_offset,
282 varying_offset, const_offset & ~3));
283
284 int scale = 1;
285 if (brw->gen == 4 && dispatch_width == 8) {
286 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
287 * u, v, r) as parameters, or we can just use the SIMD16 message
288 * consisting of (header, u). We choose the second, at the cost of a
289 * longer return length.
290 */
291 scale = 2;
292 }
293
294 enum opcode op;
295 if (brw->gen >= 7)
296 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
297 else
298 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
299 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
300 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
301 inst->regs_written = 4 * scale;
302 instructions.push_tail(inst);
303
304 if (brw->gen < 7) {
305 inst->base_mrf = 13;
306 inst->header_present = true;
307 if (brw->gen == 4)
308 inst->mlen = 3;
309 else
310 inst->mlen = 1 + dispatch_width / 8;
311 }
312
313 vec4_result.reg_offset += (const_offset & 3) * scale;
314 instructions.push_tail(MOV(dst, vec4_result));
315
316 return instructions;
317 }
318
319 /**
320 * A helper for MOV generation for fixing up broken hardware SEND dependency
321 * handling.
322 */
323 fs_inst *
324 fs_visitor::DEP_RESOLVE_MOV(int grf)
325 {
326 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
327
328 inst->ir = NULL;
329 inst->annotation = "send dependency resolve";
330
331    /* The caller always wants this MOV uncompressed, to add the minimal extra
332     * dependencies and to avoid having to deal with aligning its regs to 2.
333 */
334 inst->force_uncompressed = true;
335
336 return inst;
337 }
338
339 bool
340 fs_inst::equals(fs_inst *inst) const
341 {
342 return (opcode == inst->opcode &&
343 dst.equals(inst->dst) &&
344 src[0].equals(inst->src[0]) &&
345 src[1].equals(inst->src[1]) &&
346 src[2].equals(inst->src[2]) &&
347 saturate == inst->saturate &&
348 predicate == inst->predicate &&
349 conditional_mod == inst->conditional_mod &&
350 mlen == inst->mlen &&
351 base_mrf == inst->base_mrf &&
352 target == inst->target &&
353 eot == inst->eot &&
354 header_present == inst->header_present &&
355 shadow_compare == inst->shadow_compare &&
356 offset == inst->offset);
357 }
358
359 bool
360 fs_inst::overwrites_reg(const fs_reg &reg) const
361 {
362 return (reg.file == dst.file &&
363 reg.reg == dst.reg &&
364 reg.reg_offset >= dst.reg_offset &&
365 reg.reg_offset < dst.reg_offset + regs_written);
366 }
367
368 bool
369 fs_inst::is_send_from_grf() const
370 {
371 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
372 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
373 opcode == FS_OPCODE_INTERPOLATE_AT_CENTROID ||
374 opcode == FS_OPCODE_INTERPOLATE_AT_SAMPLE ||
375 opcode == FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET ||
376 opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET ||
377 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
378 src[1].file == GRF) ||
379 (is_tex() && src[0].file == GRF));
380 }
381
382 bool
383 fs_inst::can_do_source_mods(struct brw_context *brw)
384 {
385 if (brw->gen == 6 && is_math())
386 return false;
387
388 if (is_send_from_grf())
389 return false;
390
391 if (!backend_instruction::can_do_source_mods())
392 return false;
393
394 return true;
395 }
396
397 void
398 fs_reg::init()
399 {
400 memset(this, 0, sizeof(*this));
401 stride = 1;
402 }
403
404 /** Generic unset register constructor. */
405 fs_reg::fs_reg()
406 {
407 init();
408 this->file = BAD_FILE;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(float f)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_F;
417 this->fixed_hw_reg.dw1.f = f;
418 }
419
420 /** Immediate value constructor. */
421 fs_reg::fs_reg(int32_t i)
422 {
423 init();
424 this->file = IMM;
425 this->type = BRW_REGISTER_TYPE_D;
426 this->fixed_hw_reg.dw1.d = i;
427 }
428
429 /** Immediate value constructor. */
430 fs_reg::fs_reg(uint32_t u)
431 {
432 init();
433 this->file = IMM;
434 this->type = BRW_REGISTER_TYPE_UD;
435 this->fixed_hw_reg.dw1.ud = u;
436 }
437
438 /** Fixed brw_reg. */
439 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
440 {
441 init();
442 this->file = HW_REG;
443 this->fixed_hw_reg = fixed_hw_reg;
444 this->type = fixed_hw_reg.type;
445 }
446
447 bool
448 fs_reg::equals(const fs_reg &r) const
449 {
450 return (file == r.file &&
451 reg == r.reg &&
452 reg_offset == r.reg_offset &&
453 subreg_offset == r.subreg_offset &&
454 type == r.type &&
455 negate == r.negate &&
456 abs == r.abs &&
457 !reladdr && !r.reladdr &&
458 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
459 sizeof(fixed_hw_reg)) == 0 &&
460 stride == r.stride);
461 }
462
463 fs_reg &
464 fs_reg::apply_stride(unsigned stride)
465 {
466 assert((this->stride * stride) <= 4 &&
467 (is_power_of_two(stride) || stride == 0) &&
468 file != HW_REG && file != IMM);
469 this->stride *= stride;
470 return *this;
471 }
472
473 fs_reg &
474 fs_reg::set_smear(unsigned subreg)
475 {
476 assert(file != HW_REG && file != IMM);
477 subreg_offset = subreg * type_sz(type);
478 stride = 0;
479 return *this;
480 }
481
482 bool
483 fs_reg::is_contiguous() const
484 {
485 return stride == 1;
486 }
487
488 bool
489 fs_reg::is_valid_3src() const
490 {
491 return file == GRF || file == UNIFORM;
492 }
493
494 int
495 fs_visitor::type_size(const struct glsl_type *type)
496 {
497 unsigned int size, i;
498
499 switch (type->base_type) {
500 case GLSL_TYPE_UINT:
501 case GLSL_TYPE_INT:
502 case GLSL_TYPE_FLOAT:
503 case GLSL_TYPE_BOOL:
504 return type->components();
505 case GLSL_TYPE_ARRAY:
506 return type_size(type->fields.array) * type->length;
507 case GLSL_TYPE_STRUCT:
508 size = 0;
509 for (i = 0; i < type->length; i++) {
510 size += type_size(type->fields.structure[i].type);
511 }
512 return size;
513 case GLSL_TYPE_SAMPLER:
514 /* Samplers take up no register space, since they're baked in at
515 * link time.
516 */
517 return 0;
518 case GLSL_TYPE_ATOMIC_UINT:
519 return 0;
520 case GLSL_TYPE_IMAGE:
521 case GLSL_TYPE_VOID:
522 case GLSL_TYPE_ERROR:
523 case GLSL_TYPE_INTERFACE:
524 unreachable("not reached");
525 }
526
527 return 0;
528 }
529
530 fs_reg
531 fs_visitor::get_timestamp()
532 {
533 assert(brw->gen >= 7);
534
535 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
536 BRW_ARF_TIMESTAMP,
537 0),
538 BRW_REGISTER_TYPE_UD));
539
540 fs_reg dst = fs_reg(this, glsl_type::uint_type);
541
542 fs_inst *mov = emit(MOV(dst, ts));
543 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
544 * even if it's not enabled in the dispatch.
545 */
546 mov->force_writemask_all = true;
547 mov->force_uncompressed = true;
548
549 /* The caller wants the low 32 bits of the timestamp. Since it's running
550     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
551 * which is plenty of time for our purposes. It is identical across the
552 * EUs, but since it's tracking GPU core speed it will increment at a
553 * varying rate as render P-states change.
554 *
555 * The caller could also check if render P-states have changed (or anything
556 * else that might disrupt timing) by setting smear to 2 and checking if
557 * that field is != 0.
558 */
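   /* Back-of-the-envelope check, assuming the ~1.2 GHz figure above:
    * 2^32 cycles / 1.2e9 Hz ~= 3.6 seconds between 32-bit rollovers.
    */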
559 dst.set_smear(0);
560
561 return dst;
562 }
563
564 void
565 fs_visitor::emit_shader_time_begin()
566 {
567 current_annotation = "shader time start";
568 shader_start_time = get_timestamp();
569 }
570
571 void
572 fs_visitor::emit_shader_time_end()
573 {
574 current_annotation = "shader time end";
575
576 enum shader_time_shader_type type, written_type, reset_type;
577 if (dispatch_width == 8) {
578 type = ST_FS8;
579 written_type = ST_FS8_WRITTEN;
580 reset_type = ST_FS8_RESET;
581 } else {
582 assert(dispatch_width == 16);
583 type = ST_FS16;
584 written_type = ST_FS16_WRITTEN;
585 reset_type = ST_FS16_RESET;
586 }
587
588 fs_reg shader_end_time = get_timestamp();
589
590 /* Check that there weren't any timestamp reset events (assuming these
591 * were the only two timestamp reads that happened).
592 */
593 fs_reg reset = shader_end_time;
594 reset.set_smear(2);
595 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
596 test->conditional_mod = BRW_CONDITIONAL_Z;
597 emit(IF(BRW_PREDICATE_NORMAL));
598
599 push_force_uncompressed();
600 fs_reg start = shader_start_time;
601 start.negate = true;
602 fs_reg diff = fs_reg(this, glsl_type::uint_type);
603 emit(ADD(diff, start, shader_end_time));
604
605 /* If there were no instructions between the two timestamp gets, the diff
606 * is 2 cycles. Remove that overhead, so I can forget about that when
607 * trying to determine the time taken for single instructions.
608 */
609 emit(ADD(diff, diff, fs_reg(-2u)));
610
611 emit_shader_time_write(type, diff);
612 emit_shader_time_write(written_type, fs_reg(1u));
613 emit(BRW_OPCODE_ELSE);
614 emit_shader_time_write(reset_type, fs_reg(1u));
615 emit(BRW_OPCODE_ENDIF);
616
617 pop_force_uncompressed();
618 }
619
620 void
621 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
622 fs_reg value)
623 {
624 int shader_time_index =
625 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
626 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
627
628 fs_reg payload;
629 if (dispatch_width == 8)
630 payload = fs_reg(this, glsl_type::uvec2_type);
631 else
632 payload = fs_reg(this, glsl_type::uint_type);
633
634 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
635 fs_reg(), payload, offset, value));
636 }
637
638 void
639 fs_visitor::vfail(const char *format, va_list va)
640 {
641 char *msg;
642
643 if (failed)
644 return;
645
646 failed = true;
647
648 msg = ralloc_vasprintf(mem_ctx, format, va);
649 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
650
651 this->fail_msg = msg;
652
653 if (INTEL_DEBUG & DEBUG_WM) {
654 fprintf(stderr, "%s", msg);
655 }
656 }
657
658 void
659 fs_visitor::fail(const char *format, ...)
660 {
661 va_list va;
662
663 va_start(va, format);
664 vfail(format, va);
665 va_end(va);
666 }
667
668 /**
669 * Mark this program as impossible to compile in SIMD16 mode.
670 *
671 * During the SIMD8 compile (which happens first), we can detect and flag
672 * things that are unsupported in SIMD16 mode, so the compiler can skip
673 * the SIMD16 compile altogether.
674 *
675 * During a SIMD16 compile (if one happens anyway), this just calls fail().
676 */
677 void
678 fs_visitor::no16(const char *format, ...)
679 {
680 va_list va;
681
682 va_start(va, format);
683
684 if (dispatch_width == 16) {
685 vfail(format, va);
686 } else {
687 simd16_unsupported = true;
688
689 if (brw->perf_debug) {
690 if (no16_msg)
691 ralloc_vasprintf_append(&no16_msg, format, va);
692 else
693 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
694 }
695 }
696
697 va_end(va);
698 }
699
700 fs_inst *
701 fs_visitor::emit(enum opcode opcode)
702 {
703 return emit(new(mem_ctx) fs_inst(opcode));
704 }
705
706 fs_inst *
707 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
708 {
709 return emit(new(mem_ctx) fs_inst(opcode, dst));
710 }
711
712 fs_inst *
713 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
714 {
715 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
716 }
717
718 fs_inst *
719 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
720 const fs_reg &src1)
721 {
722 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
723 }
724
725 fs_inst *
726 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
727 const fs_reg &src1, const fs_reg &src2)
728 {
729 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
730 }
731
732 fs_inst *
733 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
734 fs_reg src[], int sources)
735 {
736 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
737 }
738
739 void
740 fs_visitor::push_force_uncompressed()
741 {
742 force_uncompressed_stack++;
743 }
744
745 void
746 fs_visitor::pop_force_uncompressed()
747 {
748 force_uncompressed_stack--;
749 assert(force_uncompressed_stack >= 0);
750 }
751
752 /**
753 * Returns true if the instruction has a flag that means it won't
754 * update an entire destination register.
755 *
756 * For example, dead code elimination and live variable analysis want to know
757 * when a write to a variable screens off any preceding values that were in
758 * it.
759 */
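/* For example, a predicated MOV only writes the channels where the predicate
 * is set, so the remaining channels keep whatever the destination held before;
 * SEL is excluded because its predicate picks between the two sources rather
 * than masking the write.  Likewise, a force_uncompressed or force_sechalf
 * write only touches half of a SIMD16 register.
 */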
760 bool
761 fs_inst::is_partial_write() const
762 {
763 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
764 this->force_uncompressed ||
765 this->force_sechalf || !this->dst.is_contiguous());
766 }
767
768 int
769 fs_inst::regs_read(fs_visitor *v, int arg) const
770 {
771 if (is_tex() && arg == 0 && src[0].file == GRF) {
772 if (v->dispatch_width == 16)
773 return (mlen + 1) / 2;
774 else
775 return mlen;
776 }
777 return 1;
778 }
779
780 bool
781 fs_inst::reads_flag() const
782 {
783 return predicate;
784 }
785
786 bool
787 fs_inst::writes_flag() const
788 {
789 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
790 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
791 }
792
793 /**
794 * Returns how many MRFs an FS opcode will write over.
795 *
796 * Note that this is not the 0 or 1 implied writes in an actual gen
797 * instruction -- the FS opcodes often generate MOVs in addition.
798 */
799 int
800 fs_visitor::implied_mrf_writes(fs_inst *inst)
801 {
802 if (inst->mlen == 0)
803 return 0;
804
805 if (inst->base_mrf == -1)
806 return 0;
807
808 switch (inst->opcode) {
809 case SHADER_OPCODE_RCP:
810 case SHADER_OPCODE_RSQ:
811 case SHADER_OPCODE_SQRT:
812 case SHADER_OPCODE_EXP2:
813 case SHADER_OPCODE_LOG2:
814 case SHADER_OPCODE_SIN:
815 case SHADER_OPCODE_COS:
816 return 1 * dispatch_width / 8;
817 case SHADER_OPCODE_POW:
818 case SHADER_OPCODE_INT_QUOTIENT:
819 case SHADER_OPCODE_INT_REMAINDER:
820 return 2 * dispatch_width / 8;
821 case SHADER_OPCODE_TEX:
822 case FS_OPCODE_TXB:
823 case SHADER_OPCODE_TXD:
824 case SHADER_OPCODE_TXF:
825 case SHADER_OPCODE_TXF_CMS:
826 case SHADER_OPCODE_TXF_MCS:
827 case SHADER_OPCODE_TG4:
828 case SHADER_OPCODE_TG4_OFFSET:
829 case SHADER_OPCODE_TXL:
830 case SHADER_OPCODE_TXS:
831 case SHADER_OPCODE_LOD:
832 return 1;
833 case FS_OPCODE_FB_WRITE:
834 return 2;
835 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
836 case SHADER_OPCODE_GEN4_SCRATCH_READ:
837 return 1;
838 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
839 return inst->mlen;
840 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
841 return 2;
842 case SHADER_OPCODE_UNTYPED_ATOMIC:
843 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
844 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
845 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
846 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
847 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
848 return 0;
849 default:
850 unreachable("not reached");
851 }
852 }
853
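/** Allocate a new virtual GRF of the given size (in registers) and return its
 * index, growing the virtual_grf_sizes array as needed.
 */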
854 int
855 fs_visitor::virtual_grf_alloc(int size)
856 {
857 if (virtual_grf_array_size <= virtual_grf_count) {
858 if (virtual_grf_array_size == 0)
859 virtual_grf_array_size = 16;
860 else
861 virtual_grf_array_size *= 2;
862 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
863 virtual_grf_array_size);
864 }
865 virtual_grf_sizes[virtual_grf_count] = size;
866 return virtual_grf_count++;
867 }
868
869 /** Fixed HW reg constructor. */
870 fs_reg::fs_reg(enum register_file file, int reg)
871 {
872 init();
873 this->file = file;
874 this->reg = reg;
875 this->type = BRW_REGISTER_TYPE_F;
876 }
877
878 /** Fixed HW reg constructor. */
879 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
880 {
881 init();
882 this->file = file;
883 this->reg = reg;
884 this->type = type;
885 }
886
887 /** Automatic reg constructor. */
888 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
889 {
890 init();
891
892 this->file = GRF;
893 this->reg = v->virtual_grf_alloc(v->type_size(type));
894 this->reg_offset = 0;
895 this->type = brw_type_for_base_type(type);
896 }
897
898 fs_reg *
899 fs_visitor::variable_storage(ir_variable *var)
900 {
901 return (fs_reg *)hash_table_find(this->variable_ht, var);
902 }
903
904 void
905 import_uniforms_callback(const void *key,
906 void *data,
907 void *closure)
908 {
909 struct hash_table *dst_ht = (struct hash_table *)closure;
910 const fs_reg *reg = (const fs_reg *)data;
911
912 if (reg->file != UNIFORM)
913 return;
914
915 hash_table_insert(dst_ht, data, key);
916 }
917
918 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
919  * This brings in those uniform definitions.
920 */
921 void
922 fs_visitor::import_uniforms(fs_visitor *v)
923 {
924 hash_table_call_foreach(v->variable_ht,
925 import_uniforms_callback,
926 variable_ht);
927 this->push_constant_loc = v->push_constant_loc;
928 this->pull_constant_loc = v->pull_constant_loc;
929 this->uniforms = v->uniforms;
930 this->param_size = v->param_size;
931 }
932
933 /* Our support for uniforms is piggy-backed on the struct
934 * gl_fragment_program, because that's where the values actually
935 * get stored, rather than in some global gl_shader_program uniform
936 * store.
937 */
938 void
939 fs_visitor::setup_uniform_values(ir_variable *ir)
940 {
941 int namelen = strlen(ir->name);
942
943 /* The data for our (non-builtin) uniforms is stored in a series of
944 * gl_uniform_driver_storage structs for each subcomponent that
945 * glGetUniformLocation() could name. We know it's been set up in the same
946 * order we'd walk the type, so walk the list of storage and find anything
947 * with our name, or the prefix of a component that starts with our name.
948 */
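   /* Illustrative examples (names assumed): a "uniform vec4 color[2]" appears
    * as one storage entry named "color" with array_elements = 2, contributing
    * 2 * 4 = 8 param slots below, while a "uniform struct { vec3 a; float b; } s"
    * appears as entries "s.a" and "s.b", both of which pass the prefix test
    * against "s".
    */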
949 unsigned params_before = uniforms;
950 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
951 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
952
953 if (strncmp(ir->name, storage->name, namelen) != 0 ||
954 (storage->name[namelen] != 0 &&
955 storage->name[namelen] != '.' &&
956 storage->name[namelen] != '[')) {
957 continue;
958 }
959
960 unsigned slots = storage->type->component_slots();
961 if (storage->array_elements)
962 slots *= storage->array_elements;
963
964 for (unsigned i = 0; i < slots; i++) {
965 stage_prog_data->param[uniforms++] = &storage->storage[i];
966 }
967 }
968
969 /* Make sure we actually initialized the right amount of stuff here. */
970 assert(params_before + ir->type->component_slots() == uniforms);
971 (void)params_before;
972 }
973
974
975 /* Our support for builtin uniforms is even scarier than non-builtin.
976 * It sits on top of the PROG_STATE_VAR parameters that are
977 * automatically updated from GL context state.
978 */
979 void
980 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
981 {
982 const ir_state_slot *const slots = ir->state_slots;
983 assert(ir->state_slots != NULL);
984
985 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
986 /* This state reference has already been setup by ir_to_mesa, but we'll
987 * get the same index back here.
988 */
989 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
990 (gl_state_index *)slots[i].tokens);
991
992 /* Add each of the unique swizzles of the element as a parameter.
993 * This'll end up matching the expected layout of the
994 * array/matrix/structure we're trying to fill in.
995 */
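      /* For instance (assumed swizzles): a vec4 state value exposed as XYZW
       * contributes four params here, while a scalar exposed as XXXX stops
       * after the first iteration because the swizzle repeats.
       */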
996 int last_swiz = -1;
997 for (unsigned int j = 0; j < 4; j++) {
998 int swiz = GET_SWZ(slots[i].swizzle, j);
999 if (swiz == last_swiz)
1000 break;
1001 last_swiz = swiz;
1002
1003 stage_prog_data->param[uniforms++] =
1004 &fp->Base.Parameters->ParameterValues[index][swiz];
1005 }
1006 }
1007 }
1008
1009 fs_reg *
1010 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1011 {
1012 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1013 fs_reg wpos = *reg;
1014 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1015
1016 /* gl_FragCoord.x */
1017 if (ir->data.pixel_center_integer) {
1018 emit(MOV(wpos, this->pixel_x));
1019 } else {
1020 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1021 }
1022 wpos.reg_offset++;
1023
1024 /* gl_FragCoord.y */
1025 if (!flip && ir->data.pixel_center_integer) {
1026 emit(MOV(wpos, this->pixel_y));
1027 } else {
1028 fs_reg pixel_y = this->pixel_y;
1029 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1030
1031 if (flip) {
1032 pixel_y.negate = true;
1033 offset += key->drawable_height - 1.0;
1034 }
1035
1036 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1037 }
1038 wpos.reg_offset++;
1039
1040 /* gl_FragCoord.z */
1041 if (brw->gen >= 6) {
1042 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1043 } else {
1044 emit(FS_OPCODE_LINTERP, wpos,
1045 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1046 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1047 interp_reg(VARYING_SLOT_POS, 2));
1048 }
1049 wpos.reg_offset++;
1050
1051 /* gl_FragCoord.w: Already set up in emit_interpolation */
1052 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1053
1054 return reg;
1055 }
1056
1057 fs_inst *
1058 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1059 glsl_interp_qualifier interpolation_mode,
1060 bool is_centroid, bool is_sample)
1061 {
1062 brw_wm_barycentric_interp_mode barycoord_mode;
1063 if (brw->gen >= 6) {
1064 if (is_centroid) {
1065 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1066 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1067 else
1068 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1069 } else if (is_sample) {
1070 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1071 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1072 else
1073 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1074 } else {
1075 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1076 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1077 else
1078 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1079 }
1080 } else {
1081 /* On Ironlake and below, there is only one interpolation mode.
1082 * Centroid interpolation doesn't mean anything on this hardware --
1083 * there is no multisampling.
1084 */
1085 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1086 }
1087 return emit(FS_OPCODE_LINTERP, attr,
1088 this->delta_x[barycoord_mode],
1089 this->delta_y[barycoord_mode], interp);
1090 }
1091
1092 fs_reg *
1093 fs_visitor::emit_general_interpolation(ir_variable *ir)
1094 {
1095 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1096 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1097 fs_reg attr = *reg;
1098
1099 unsigned int array_elements;
1100 const glsl_type *type;
1101
1102 if (ir->type->is_array()) {
1103 array_elements = ir->type->length;
1104 if (array_elements == 0) {
1105 fail("dereferenced array '%s' has length 0\n", ir->name);
1106 }
1107 type = ir->type->fields.array;
1108 } else {
1109 array_elements = 1;
1110 type = ir->type;
1111 }
1112
1113 glsl_interp_qualifier interpolation_mode =
1114 ir->determine_interpolation_mode(key->flat_shade);
1115
1116 int location = ir->data.location;
1117 for (unsigned int i = 0; i < array_elements; i++) {
1118 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1119 if (prog_data->urb_setup[location] == -1) {
1120 /* If there's no incoming setup data for this slot, don't
1121 * emit interpolation for it.
1122 */
1123 attr.reg_offset += type->vector_elements;
1124 location++;
1125 continue;
1126 }
1127
1128 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1129 /* Constant interpolation (flat shading) case. The SF has
1130 * handed us defined values in only the constant offset
1131 * field of the setup reg.
1132 */
1133 for (unsigned int k = 0; k < type->vector_elements; k++) {
1134 struct brw_reg interp = interp_reg(location, k);
1135 interp = suboffset(interp, 3);
1136 interp.type = reg->type;
1137 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1138 attr.reg_offset++;
1139 }
1140 } else {
1141 /* Smooth/noperspective interpolation case. */
1142 for (unsigned int k = 0; k < type->vector_elements; k++) {
1143 struct brw_reg interp = interp_reg(location, k);
1144 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1145 /* Get the pixel/sample mask into f0 so that we know
1146 * which pixels are lit. Then, for each channel that is
1147 * unlit, replace the centroid data with non-centroid
1148 * data.
1149 */
1150 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1151
1152 fs_inst *inst;
1153 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1154 false, false);
1155 inst->predicate = BRW_PREDICATE_NORMAL;
1156 inst->predicate_inverse = true;
1157 if (brw->has_pln)
1158 inst->no_dd_clear = true;
1159
1160 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1161 ir->data.centroid && !key->persample_shading,
1162 ir->data.sample || key->persample_shading);
1163 inst->predicate = BRW_PREDICATE_NORMAL;
1164 inst->predicate_inverse = false;
1165 if (brw->has_pln)
1166 inst->no_dd_check = true;
1167
1168 } else {
1169 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1170 ir->data.centroid && !key->persample_shading,
1171 ir->data.sample || key->persample_shading);
1172 }
1173 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1174 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1175 }
1176 attr.reg_offset++;
1177 }
1178
1179 }
1180 location++;
1181 }
1182 }
1183
1184 return reg;
1185 }
1186
1187 fs_reg *
1188 fs_visitor::emit_frontfacing_interpolation()
1189 {
1190 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1191
1192 if (brw->gen >= 6) {
1193 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1194 * a boolean result from this (~0/true or 0/false).
1195 *
1196 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1197 * this task in only one instruction:
1198 * - a negation source modifier will flip the bit; and
1199 * - a W -> D type conversion will sign extend the bit into the high
1200 * word of the destination.
1201 *
1202 * An ASR 15 fills the low word of the destination.
1203 */
1204 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1205 g0.negate = true;
1206
1207 emit(ASR(*reg, g0, fs_reg(15)));
1208 } else {
1209 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1210 * a boolean result from this (1/true or 0/false).
1211 *
1212 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1213 * the negation source modifier to flip it. Unfortunately the SHR
1214 * instruction only operates on UD (or D with an abs source modifier)
1215 * sources without negation.
1216 *
1217 * Instead, use ASR (which will give ~0/true or 0/false) followed by an
1218 * AND 1.
1219 */
1220 fs_reg asr = fs_reg(this, glsl_type::bool_type);
1221 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1222 g1_6.negate = true;
1223
1224 emit(ASR(asr, g1_6, fs_reg(31)));
1225 emit(AND(*reg, asr, fs_reg(1)));
1226 }
1227
1228 return reg;
1229 }
1230
1231 void
1232 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1233 {
1234 assert(dst.type == BRW_REGISTER_TYPE_F);
1235
1236 if (key->compute_pos_offset) {
1237 /* Convert int_sample_pos to floating point */
1238 emit(MOV(dst, int_sample_pos));
1239 /* Scale to the range [0, 1] */
1240 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1241 }
1242 else {
1243 /* From ARB_sample_shading specification:
1244 * "When rendering to a non-multisample buffer, or if multisample
1245 * rasterization is disabled, gl_SamplePosition will always be
1246 * (0.5, 0.5).
1247     * (0.5, 0.5)."
1248 emit(MOV(dst, fs_reg(0.5f)));
1249 }
1250 }
1251
1252 fs_reg *
1253 fs_visitor::emit_samplepos_setup()
1254 {
1255 assert(brw->gen >= 6);
1256
1257 this->current_annotation = "compute sample position";
1258 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1259 fs_reg pos = *reg;
1260 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1261 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1262
1263 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1264 * mode will be enabled.
1265 *
1266 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1267 * R31.1:0 Position Offset X/Y for Slot[3:0]
1268 * R31.3:2 Position Offset X/Y for Slot[7:4]
1269 * .....
1270 *
1271 * The X, Y sample positions come in as bytes in thread payload. So, read
1272 * the positions using vstride=16, width=8, hstride=2.
1273 */
1274 struct brw_reg sample_pos_reg =
1275 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1276 BRW_REGISTER_TYPE_B), 16, 8, 2);
1277
1278 fs_inst *inst = emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1279 if (dispatch_width == 16) {
1280 inst->force_uncompressed = true;
1281 inst = emit(MOV(half(int_sample_x, 1),
1282 fs_reg(suboffset(sample_pos_reg, 16))));
1283 inst->force_sechalf = true;
1284 }
1285 /* Compute gl_SamplePosition.x */
1286 compute_sample_position(pos, int_sample_x);
1287 pos.reg_offset++;
1288 inst = emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1289 if (dispatch_width == 16) {
1290 inst->force_uncompressed = true;
1291 inst = emit(MOV(half(int_sample_y, 1),
1292 fs_reg(suboffset(sample_pos_reg, 17))));
1293 inst->force_sechalf = true;
1294 }
1295 /* Compute gl_SamplePosition.y */
1296 compute_sample_position(pos, int_sample_y);
1297 return reg;
1298 }
1299
1300 fs_reg *
1301 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1302 {
1303 assert(brw->gen >= 6);
1304
1305 this->current_annotation = "compute sample id";
1306 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1307
1308 if (key->compute_sample_id) {
1309 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1310 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1311 t2.type = BRW_REGISTER_TYPE_UW;
1312
1313 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1314 * 8x multisampling, subspan 0 will represent sample N (where N
1315 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1316 * 7. We can find the value of N by looking at R0.0 bits 7:6
1317 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1318 * (since samples are always delivered in pairs). That is, we
1319 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1320 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1321 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1322 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1323 * populating a temporary variable with the sequence (0, 1, 2, 3),
1324 * and then reading from it using vstride=1, width=4, hstride=0.
1325 * These computations hold good for 4x multisampling as well.
1326 *
1327 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1328 * the first four slots are sample 0 of subspan 0; the next four
1329 * are sample 1 of subspan 0; the third group is sample 0 of
1330 * subspan 1, and finally sample 1 of subspan 1.
1331 */
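      /* A worked example with an assumed payload: if R0.0 bits 7:6 read 2,
       * then (R0.0 & 0xc0) >> 5 = 4, and adding the SIMD8 sequence
       * (0, 0, 0, 0, 1, 1, 1, 1) yields sample IDs 4,4,4,4,5,5,5,5 across
       * the eight channels.
       */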
1332 fs_inst *inst;
1333 inst = emit(BRW_OPCODE_AND, t1,
1334 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1335 fs_reg(0xc0));
1336 inst->force_writemask_all = true;
1337 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1338 inst->force_writemask_all = true;
1339 /* This works for both SIMD8 and SIMD16 */
1340 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1341 inst->force_writemask_all = true;
1342 /* This special instruction takes care of setting vstride=1,
1343 * width=4, hstride=0 of t2 during an ADD instruction.
1344 */
1345 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1346 } else {
1347 /* As per GL_ARB_sample_shading specification:
1348 * "When rendering to a non-multisample buffer, or if multisample
1349 * rasterization is disabled, gl_SampleID will always be zero."
1350 */
1351 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1352 }
1353
1354 return reg;
1355 }
1356
1357 fs_reg
1358 fs_visitor::fix_math_operand(fs_reg src)
1359 {
1360 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1361 * might be able to do better by doing execsize = 1 math and then
1362 * expanding that result out, but we would need to be careful with
1363 * masking.
1364 *
1365 * The hardware ignores source modifiers (negate and abs) on math
1366 * instructions, so we also move to a temp to set those up.
1367 */
1368 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1369 !src.abs && !src.negate)
1370 return src;
1371
1372 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1373 * operands to math
1374 */
1375 if (brw->gen >= 7 && src.file != IMM)
1376 return src;
1377
1378 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1379 expanded.type = src.type;
1380 emit(BRW_OPCODE_MOV, expanded, src);
1381 return expanded;
1382 }
1383
1384 fs_inst *
1385 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1386 {
1387 switch (opcode) {
1388 case SHADER_OPCODE_RCP:
1389 case SHADER_OPCODE_RSQ:
1390 case SHADER_OPCODE_SQRT:
1391 case SHADER_OPCODE_EXP2:
1392 case SHADER_OPCODE_LOG2:
1393 case SHADER_OPCODE_SIN:
1394 case SHADER_OPCODE_COS:
1395 break;
1396 default:
1397 unreachable("not reached: bad math opcode");
1398 }
1399
1400 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1401 * might be able to do better by doing execsize = 1 math and then
1402 * expanding that result out, but we would need to be careful with
1403 * masking.
1404 *
1405 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1406 * instructions, so we also move to a temp to set those up.
1407 */
1408 if (brw->gen == 6 || brw->gen == 7)
1409 src = fix_math_operand(src);
1410
1411 fs_inst *inst = emit(opcode, dst, src);
1412
1413 if (brw->gen < 6) {
1414 inst->base_mrf = 2;
1415 inst->mlen = dispatch_width / 8;
1416 }
1417
1418 return inst;
1419 }
1420
1421 fs_inst *
1422 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1423 {
1424 int base_mrf = 2;
1425 fs_inst *inst;
1426
1427 if (brw->gen >= 8) {
1428 inst = emit(opcode, dst, src0, src1);
1429 } else if (brw->gen >= 6) {
1430 src0 = fix_math_operand(src0);
1431 src1 = fix_math_operand(src1);
1432
1433 inst = emit(opcode, dst, src0, src1);
1434 } else {
1435 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1436 * "Message Payload":
1437 *
1438 * "Operand0[7]. For the INT DIV functions, this operand is the
1439 * denominator."
1440 * ...
1441 * "Operand1[7]. For the INT DIV functions, this operand is the
1442 * numerator."
1443 */
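      /* So for INT_QUOTIENT/INT_REMAINDER computing src0 / src1, the
       * denominator (src1) must be sent as operand 0 and the numerator (src0)
       * as operand 1, which is what the swap below arranges.
       */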
1444 bool is_int_div = opcode != SHADER_OPCODE_POW;
1445 fs_reg &op0 = is_int_div ? src1 : src0;
1446 fs_reg &op1 = is_int_div ? src0 : src1;
1447
1448 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1449 inst = emit(opcode, dst, op0, reg_null_f);
1450
1451 inst->base_mrf = base_mrf;
1452 inst->mlen = 2 * dispatch_width / 8;
1453 }
1454 return inst;
1455 }
1456
1457 void
1458 fs_visitor::assign_curb_setup()
1459 {
1460 if (dispatch_width == 8) {
1461 prog_data->base.dispatch_grf_start_reg = payload.num_regs;
1462 } else {
1463 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1464 }
1465
1466 prog_data->base.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
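   /* For example (illustrative count): 20 push-constant components round up
    * to ALIGN(20, 8) / 8 = 3 registers of CURBE read length, since each
    * register holds 8 dwords.
    */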
1467
1468 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1469 foreach_in_list(fs_inst, inst, &instructions) {
1470 for (unsigned int i = 0; i < inst->sources; i++) {
1471 if (inst->src[i].file == UNIFORM) {
1472 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1473 int constant_nr;
1474 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1475 constant_nr = push_constant_loc[uniform_nr];
1476 } else {
1477 /* Section 5.11 of the OpenGL 4.1 spec says:
1478 * "Out-of-bounds reads return undefined values, which include
1479 * values from other variables of the active program or zero."
1480 * Just return the first push constant.
1481 */
1482 constant_nr = 0;
1483 }
1484
1485 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1486 constant_nr / 8,
1487 constant_nr % 8);
1488
1489 inst->src[i].file = HW_REG;
1490 inst->src[i].fixed_hw_reg = byte_offset(
1491 retype(brw_reg, inst->src[i].type),
1492 inst->src[i].subreg_offset);
1493 }
1494 }
1495 }
1496 }
1497
1498 void
1499 fs_visitor::calculate_urb_setup()
1500 {
1501 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1502 prog_data->urb_setup[i] = -1;
1503 }
1504
1505 int urb_next = 0;
1506 /* Figure out where each of the incoming setup attributes lands. */
1507 if (brw->gen >= 6) {
1508 if (_mesa_bitcount_64(fp->Base.InputsRead &
1509 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1510 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1511 * first 16 varying inputs, so we can put them wherever we want.
1512 * Just put them in order.
1513 *
1514 * This is useful because it means that (a) inputs not used by the
1515 * fragment shader won't take up valuable register space, and (b) we
1516 * won't have to recompile the fragment shader if it gets paired with
1517 * a different vertex (or geometry) shader.
1518 */
1519 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1520 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1521 BITFIELD64_BIT(i)) {
1522 prog_data->urb_setup[i] = urb_next++;
1523 }
1524 }
1525 } else {
1526 /* We have enough input varyings that the SF/SBE pipeline stage can't
1527 * arbitrarily rearrange them to suit our whim; we have to put them
1528 * in an order that matches the output of the previous pipeline stage
1529 * (geometry or vertex shader).
1530 */
1531 struct brw_vue_map prev_stage_vue_map;
1532 brw_compute_vue_map(brw, &prev_stage_vue_map,
1533 key->input_slots_valid);
1534 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1535 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1536 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1537 slot++) {
1538 int varying = prev_stage_vue_map.slot_to_varying[slot];
1539 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1540 * unused.
1541 */
1542 if (varying != BRW_VARYING_SLOT_COUNT &&
1543 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1544 BITFIELD64_BIT(varying))) {
1545 prog_data->urb_setup[varying] = slot - first_slot;
1546 }
1547 }
1548 urb_next = prev_stage_vue_map.num_slots - first_slot;
1549 }
1550 } else {
1551 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1552 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1553 /* Point size is packed into the header, not as a general attribute */
1554 if (i == VARYING_SLOT_PSIZ)
1555 continue;
1556
1557 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1558 /* The back color slot is skipped when the front color is
1559 * also written to. In addition, some slots can be
1560 * written in the vertex shader and not read in the
1561 * fragment shader. So the register number must always be
1562 * incremented, mapped or not.
1563 */
1564 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1565 prog_data->urb_setup[i] = urb_next;
1566 urb_next++;
1567 }
1568 }
1569
1570 /*
1571     * It's an FS-only attribute, and we did interpolation for this attribute
1572     * in the SF thread. So, count it here, too.
1573 *
1574 * See compile_sf_prog() for more info.
1575 */
1576 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1577 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1578 }
1579
1580 prog_data->num_varying_inputs = urb_next;
1581 }
1582
1583 void
1584 fs_visitor::assign_urb_setup()
1585 {
1586 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1587
1588 /* Offset all the urb_setup[] index by the actual position of the
1589 * setup regs, now that the location of the constants has been chosen.
1590 */
1591 foreach_in_list(fs_inst, inst, &instructions) {
1592 if (inst->opcode == FS_OPCODE_LINTERP) {
1593 assert(inst->src[2].file == HW_REG);
1594 inst->src[2].fixed_hw_reg.nr += urb_start;
1595 }
1596
1597 if (inst->opcode == FS_OPCODE_CINTERP) {
1598 assert(inst->src[0].file == HW_REG);
1599 inst->src[0].fixed_hw_reg.nr += urb_start;
1600 }
1601 }
1602
1603 /* Each attribute is 4 setup channels, each of which is half a reg. */
1604 this->first_non_payload_grf =
1605 urb_start + prog_data->num_varying_inputs * 2;
1606 }
1607
1608 /**
1609 * Split large virtual GRFs into separate components if we can.
1610 *
1611  * This mostly duplicates what brw_fs_vector_splitting does,
1612 * but that's really conservative because it's afraid of doing
1613 * splitting that doesn't result in real progress after the rest of
1614 * the optimization phases, which would cause infinite looping in
1615 * optimization. We can do it once here, safely. This also has the
1616 * opportunity to split interpolated values, or maybe even uniforms,
1617 * which we don't have at the IR level.
1618 *
1619 * We want to split, because virtual GRFs are what we register
1620 * allocate and spill (due to contiguousness requirements for some
1621 * instructions), and they're what we naturally generate in the
1622 * codegen process, but most virtual GRFs don't actually need to be
1623 * contiguous sets of GRFs. If we split, we'll end up with reduced
1624 * live intervals and better dead code elimination and coalescing.
1625 */
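/* As an illustration: a virtual GRF holding a vec4 temporary occupies four
 * contiguous registers; splitting it into four size-1 virtual GRFs lets each
 * component be allocated, coalesced, and dead-code-eliminated independently.
 */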
1626 void
1627 fs_visitor::split_virtual_grfs()
1628 {
1629 int num_vars = this->virtual_grf_count;
1630 bool split_grf[num_vars];
1631 int new_virtual_grf[num_vars];
1632
1633    /* Try to split anything larger than one register. */
1634 for (int i = 0; i < num_vars; i++) {
1635 if (this->virtual_grf_sizes[i] != 1)
1636 split_grf[i] = true;
1637 else
1638 split_grf[i] = false;
1639 }
1640
1641 if (brw->has_pln &&
1642 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1643 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1644 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1645 * Gen6, that was the only supported interpolation mode, and since Gen6,
1646 * delta_x and delta_y are in fixed hardware registers.
1647 */
1648 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1649 false;
1650 }
1651
1652 foreach_in_list(fs_inst, inst, &instructions) {
1653 /* If there's a SEND message that requires contiguous destination
1654 * registers, no splitting is allowed.
1655 */
1656 if (inst->regs_written > 1) {
1657 split_grf[inst->dst.reg] = false;
1658 }
1659
1660 /* If we're sending from a GRF, don't split it, on the assumption that
1661 * the send is reading the whole thing.
1662 */
1663 if (inst->is_send_from_grf()) {
1664 for (int i = 0; i < inst->sources; i++) {
1665 if (inst->src[i].file == GRF) {
1666 split_grf[inst->src[i].reg] = false;
1667 }
1668 }
1669 }
1670 }
1671
1672 /* Allocate new space for split regs. Note that the virtual
1673 * numbers will be contiguous.
1674 */
1675 for (int i = 0; i < num_vars; i++) {
1676 if (split_grf[i]) {
1677 new_virtual_grf[i] = virtual_grf_alloc(1);
1678 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1679 int reg = virtual_grf_alloc(1);
1680 assert(reg == new_virtual_grf[i] + j - 1);
1681 (void) reg;
1682 }
1683 this->virtual_grf_sizes[i] = 1;
1684 }
1685 }
1686
1687 foreach_in_list(fs_inst, inst, &instructions) {
1688 if (inst->dst.file == GRF &&
1689 split_grf[inst->dst.reg] &&
1690 inst->dst.reg_offset != 0) {
1691 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1692 inst->dst.reg_offset - 1);
1693 inst->dst.reg_offset = 0;
1694 }
1695 for (int i = 0; i < inst->sources; i++) {
1696 if (inst->src[i].file == GRF &&
1697 split_grf[inst->src[i].reg] &&
1698 inst->src[i].reg_offset != 0) {
1699 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1700 inst->src[i].reg_offset - 1);
1701 inst->src[i].reg_offset = 0;
1702 }
1703 }
1704 }
1705 invalidate_live_intervals(false);
1706 }
1707
1708 /**
1709 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1710 *
1711 * During code generation, we create tons of temporary variables, many of
1712 * which get immediately killed and are never used again. Yet, in later
1713 * optimization and analysis passes, such as compute_live_intervals, we need
1714 * to loop over all the virtual GRFs. Compacting them can save a lot of
1715 * overhead.
1716 */
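/* For example (illustrative numbering): if only virtual GRFs 0, 2 and 5 are
 * referenced out of 6, remap_table ends up as {0, -1, 1, -1, -1, 2} and
 * virtual_grf_count drops to 3.
 */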
1717 void
1718 fs_visitor::compact_virtual_grfs()
1719 {
1720 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1721 return;
1722
1723 /* Mark which virtual GRFs are used, and count how many. */
1724 int remap_table[this->virtual_grf_count];
1725 memset(remap_table, -1, sizeof(remap_table));
1726
1727 foreach_in_list(const fs_inst, inst, &instructions) {
1728 if (inst->dst.file == GRF)
1729 remap_table[inst->dst.reg] = 0;
1730
1731 for (int i = 0; i < inst->sources; i++) {
1732 if (inst->src[i].file == GRF)
1733 remap_table[inst->src[i].reg] = 0;
1734 }
1735 }
1736
1737 /* Compact the GRF arrays. */
1738 int new_index = 0;
1739 for (int i = 0; i < this->virtual_grf_count; i++) {
1740 if (remap_table[i] != -1) {
1741 remap_table[i] = new_index;
1742 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1743 invalidate_live_intervals(false);
1744 ++new_index;
1745 }
1746 }
1747
1748 this->virtual_grf_count = new_index;
1749
1750 /* Patch all the instructions to use the newly renumbered registers */
1751 foreach_in_list(fs_inst, inst, &instructions) {
1752 if (inst->dst.file == GRF)
1753 inst->dst.reg = remap_table[inst->dst.reg];
1754
1755 for (int i = 0; i < inst->sources; i++) {
1756 if (inst->src[i].file == GRF)
1757 inst->src[i].reg = remap_table[inst->src[i].reg];
1758 }
1759 }
1760
1761 /* Patch all the references to delta_x/delta_y, since they're used in
1762 * register allocation.
1763 */
1764 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
1765 if (delta_x[i].file == GRF && remap_table[delta_x[i].reg] != -1) {
1766 delta_x[i].reg = remap_table[delta_x[i].reg];
1767 }
1768 }
1769 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
1770 if (delta_y[i].file == GRF && remap_table[delta_y[i].reg] != -1) {
1771 delta_y[i].reg = remap_table[delta_y[i].reg];
1772 }
1773 }
1774 }
1775
1776 /*
1777 * Implements array access of uniforms by inserting a
1778 * PULL_CONSTANT_LOAD instruction.
1779 *
1780 * Unlike temporary GRF array access (where we don't support it due to
1781 * the difficulty of doing relative addressing on instruction
1782 * destinations), we could potentially do array access of uniforms
1783 * that were loaded in GRF space as push constants. In real-world
1784 * usage we've seen, though, the arrays being used are always larger
1785 * than we could load as push constants, so just always move all
1786 * uniform array access out to a pull constant buffer.
1787 */
1788 void
1789 fs_visitor::move_uniform_array_access_to_pull_constants()
1790 {
1791 if (dispatch_width != 8)
1792 return;
1793
1794 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1795
1796 for (unsigned int i = 0; i < uniforms; i++) {
1797 pull_constant_loc[i] = -1;
1798 }
1799
1800 /* Walk through and find array access of uniforms. Put a copy of that
1801 * uniform in the pull constant buffer.
1802 *
1803 * Note that we don't move constant-indexed accesses to arrays. No
1804 * testing has been done of the performance impact of this choice.
1805 */
1806 foreach_in_list_safe(fs_inst, inst, &instructions) {
1807 for (int i = 0 ; i < inst->sources; i++) {
1808 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1809 continue;
1810
1811 int uniform = inst->src[i].reg;
1812
1813 /* If this array isn't already present in the pull constant buffer,
1814 * add it.
1815 */
1816 if (pull_constant_loc[uniform] == -1) {
1817 const gl_constant_value **values = &stage_prog_data->param[uniform];
1818
1819 assert(param_size[uniform]);
1820
1821 for (int j = 0; j < param_size[uniform]; j++) {
1822 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1823
1824 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1825 values[j];
1826 }
1827 }
1828 }
1829 }
1830 }
1831
1832 /**
1833 * Assign UNIFORM file registers to either push constants or pull constants.
1834 *
1835 * We allow a fragment shader to have more than the specified minimum
1836 * maximum number of fragment shader uniform components (64). If
1837 * there are too many of these, they'd fill up all of register space.
1838 * So, this will push some of them out to the pull constant buffer and
1839 * update the program to load them.
1840 */
1841 void
1842 fs_visitor::assign_constant_locations()
1843 {
1844 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1845 if (dispatch_width != 8)
1846 return;
1847
1848 /* Find which UNIFORM registers are still in use. */
1849 bool is_live[uniforms];
1850 for (unsigned int i = 0; i < uniforms; i++) {
1851 is_live[i] = false;
1852 }
1853
1854 foreach_in_list(fs_inst, inst, &instructions) {
1855 for (int i = 0; i < inst->sources; i++) {
1856 if (inst->src[i].file != UNIFORM)
1857 continue;
1858
1859 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1860 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1861 is_live[constant_nr] = true;
1862 }
1863 }
1864
1865 /* Only allow 16 registers (128 uniform components) as push constants.
1866 *
1867 * Just demote the end of the list. We could probably do better
1868 * here, demoting things that are rarely used in the program first.
1869 *
1870 * If changing this value, note the limitation about total_regs in
1871 * brw_curbe.c.
1872 */
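/* The arithmetic behind the limit below: 16 registers * 8 float components
 * per register = 128 components. A shader with, say, 200 live uniform
 * components would keep the first 128 as push constants and demote the
 * remaining 72 to the pull buffer.
 */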
1873 unsigned int max_push_components = 16 * 8;
1874 unsigned int num_push_constants = 0;
1875
1876 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1877
1878 for (unsigned int i = 0; i < uniforms; i++) {
1879 if (!is_live[i] || pull_constant_loc[i] != -1) {
1880 /* This UNIFORM register is either dead, or has already been demoted
1881 * to a pull const. Mark it as no longer living in the param[] array.
1882 */
1883 push_constant_loc[i] = -1;
1884 continue;
1885 }
1886
1887 if (num_push_constants < max_push_components) {
1888 /* Retain as a push constant. Record the location in the param[]
1889 * array.
1890 */
1891 push_constant_loc[i] = num_push_constants++;
1892 } else {
1893 /* Demote to a pull constant. */
1894 push_constant_loc[i] = -1;
1895
1896 int pull_index = stage_prog_data->nr_pull_params++;
1897 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1898 pull_constant_loc[i] = pull_index;
1899 }
1900 }
1901
1902 stage_prog_data->nr_params = num_push_constants;
1903
1904 /* Up until now, the param[] array has been indexed by reg + reg_offset
1905 * of UNIFORM registers. Condense it to only contain the uniforms we
1906 * chose to upload as push constants.
1907 */
1908 for (unsigned int i = 0; i < uniforms; i++) {
1909 int remapped = push_constant_loc[i];
1910
1911 if (remapped == -1)
1912 continue;
1913
1914 assert(remapped <= (int)i);
1915 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1916 }
1917 }
1918
1919 /**
1920 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1921 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1922 */
1923 void
1924 fs_visitor::demote_pull_constants()
1925 {
1926 calculate_cfg();
1927
1928 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1929 for (int i = 0; i < inst->sources; i++) {
1930 if (inst->src[i].file != UNIFORM)
1931 continue;
1932
1933 int pull_index = pull_constant_loc[inst->src[i].reg +
1934 inst->src[i].reg_offset];
1935 if (pull_index == -1)
1936 continue;
1937
1938 /* Set up the annotation tracking for new generated instructions. */
1939 base_ir = inst->ir;
1940 current_annotation = inst->annotation;
1941
1942 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1943 fs_reg dst = fs_reg(this, glsl_type::float_type);
1944
1945 /* Generate a pull load into dst. */
1946 if (inst->src[i].reladdr) {
1947 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1948 surf_index,
1949 *inst->src[i].reladdr,
1950 pull_index);
1951 inst->insert_before(block, &list);
1952 inst->src[i].reladdr = NULL;
1953 } else {
1954 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1955 fs_inst *pull =
1956 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1957 dst, surf_index, offset);
1958 inst->insert_before(block, pull);
1959 inst->src[i].set_smear(pull_index & 3);
1960 }
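/* Worked example of the constant-offset path above: for pull_index == 5
 * the byte offset is (5 * 4) & ~15 == 16, i.e. the second vec4 of the
 * pull buffer, and set_smear(5 & 3) == 1 picks the second dword out of
 * the four that were loaded.
 */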
1961
1962 /* Rewrite the instruction to use the temporary VGRF. */
1963 inst->src[i].file = GRF;
1964 inst->src[i].reg = dst.reg;
1965 inst->src[i].reg_offset = 0;
1966 }
1967 }
1968 invalidate_live_intervals(false);
1969 }
1970
1971 bool
1972 fs_visitor::opt_algebraic()
1973 {
1974 bool progress = false;
1975
1976 foreach_in_list(fs_inst, inst, &instructions) {
1977 switch (inst->opcode) {
1978 case BRW_OPCODE_MUL:
1979 if (inst->src[1].file != IMM)
1980 continue;
1981
1982 /* a * 1.0 = a */
1983 if (inst->src[1].is_one()) {
1984 inst->opcode = BRW_OPCODE_MOV;
1985 inst->src[1] = reg_undef;
1986 progress = true;
1987 break;
1988 }
1989
1990 /* a * 0.0 = 0.0 */
1991 if (inst->src[1].is_zero()) {
1992 inst->opcode = BRW_OPCODE_MOV;
1993 inst->src[0] = inst->src[1];
1994 inst->src[1] = reg_undef;
1995 progress = true;
1996 break;
1997 }
1998
1999 break;
2000 case BRW_OPCODE_ADD:
2001 if (inst->src[1].file != IMM)
2002 continue;
2003
2004 /* a + 0.0 = a */
2005 if (inst->src[1].is_zero()) {
2006 inst->opcode = BRW_OPCODE_MOV;
2007 inst->src[1] = reg_undef;
2008 progress = true;
2009 break;
2010 }
2011 break;
2012 case BRW_OPCODE_OR:
2013 if (inst->src[0].equals(inst->src[1])) {
2014 inst->opcode = BRW_OPCODE_MOV;
2015 inst->src[1] = reg_undef;
2016 progress = true;
2017 break;
2018 }
2019 break;
2020 case BRW_OPCODE_LRP:
2021 if (inst->src[1].equals(inst->src[2])) {
2022 inst->opcode = BRW_OPCODE_MOV;
2023 inst->src[0] = inst->src[1];
2024 inst->src[1] = reg_undef;
2025 inst->src[2] = reg_undef;
2026 progress = true;
2027 break;
2028 }
2029 break;
2030 case BRW_OPCODE_SEL:
2031 if (inst->src[0].equals(inst->src[1])) {
2032 inst->opcode = BRW_OPCODE_MOV;
2033 inst->src[1] = reg_undef;
2034 inst->predicate = BRW_PREDICATE_NONE;
2035 inst->predicate_inverse = false;
2036 progress = true;
2037 } else if (inst->saturate && inst->src[1].file == IMM) {
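/* A saturating SEL against an immediate bound that the saturate already
 * enforces is redundant. For example, "sel.sat.l dst, x, 1.0F" computes
 * min(x, 1.0) and then clamps to [0, 1], which is the same as
 * "mov.sat dst, x"; the GE/G case below is the mirror image with a bound
 * of 0.0F or less.
 */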
2038 switch (inst->conditional_mod) {
2039 case BRW_CONDITIONAL_LE:
2040 case BRW_CONDITIONAL_L:
2041 switch (inst->src[1].type) {
2042 case BRW_REGISTER_TYPE_F:
2043 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2044 inst->opcode = BRW_OPCODE_MOV;
2045 inst->src[1] = reg_undef;
2046 progress = true;
2047 }
2048 break;
2049 default:
2050 break;
2051 }
2052 break;
2053 case BRW_CONDITIONAL_GE:
2054 case BRW_CONDITIONAL_G:
2055 switch (inst->src[1].type) {
2056 case BRW_REGISTER_TYPE_F:
2057 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2058 inst->opcode = BRW_OPCODE_MOV;
2059 inst->src[1] = reg_undef;
2060 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2061 progress = true;
2062 }
2063 break;
2064 default:
2065 break;
2066 }
2067 default:
2068 break;
2069 }
2070 }
2071 break;
2072 default:
2073 break;
2074 }
2075 }
2076
2077 return progress;
2078 }
2079
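/* Roughly: when a single-register VGRF is completely rewritten outside of
 * control flow, give the new value a fresh VGRF and patch later uses, so
 * that unrelated values stop sharing one register and their live ranges
 * shrink.
 */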
2080 bool
2081 fs_visitor::opt_register_renaming()
2082 {
2083 bool progress = false;
2084 int depth = 0;
2085
2086 int remap[virtual_grf_count];
2087 memset(remap, -1, sizeof(int) * virtual_grf_count);
2088
2089 foreach_in_list(fs_inst, inst, &this->instructions) {
2090 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2091 depth++;
2092 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2093 inst->opcode == BRW_OPCODE_WHILE) {
2094 depth--;
2095 }
2096
2097 /* Rewrite instruction sources. */
2098 for (int i = 0; i < inst->sources; i++) {
2099 if (inst->src[i].file == GRF &&
2100 remap[inst->src[i].reg] != -1 &&
2101 remap[inst->src[i].reg] != inst->src[i].reg) {
2102 inst->src[i].reg = remap[inst->src[i].reg];
2103 progress = true;
2104 }
2105 }
2106
2107 const int dst = inst->dst.reg;
2108
2109 if (depth == 0 &&
2110 inst->dst.file == GRF &&
2111 virtual_grf_sizes[inst->dst.reg] == 1 &&
2112 !inst->is_partial_write()) {
2113 if (remap[dst] == -1) {
2114 remap[dst] = dst;
2115 } else {
2116 remap[dst] = virtual_grf_alloc(1);
2117 inst->dst.reg = remap[dst];
2118 progress = true;
2119 }
2120 } else if (inst->dst.file == GRF &&
2121 remap[dst] != -1 &&
2122 remap[dst] != dst) {
2123 inst->dst.reg = remap[dst];
2124 progress = true;
2125 }
2126 }
2127
2128 if (progress) {
2129 invalidate_live_intervals();
2130
2131 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2132 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2133 delta_x[i].reg = remap[delta_x[i].reg];
2134 }
2135 }
2136 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2137 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2138 delta_y[i].reg = remap[delta_y[i].reg];
2139 }
2140 }
2141 }
2142
2143 return progress;
2144 }
2145
2146 bool
2147 fs_visitor::compute_to_mrf()
2148 {
2149 bool progress = false;
2150 int next_ip = 0;
2151
2152 calculate_live_intervals();
2153
2154 foreach_in_list_safe(fs_inst, inst, &instructions) {
2155 int ip = next_ip;
2156 next_ip++;
2157
2158 if (inst->opcode != BRW_OPCODE_MOV ||
2159 inst->is_partial_write() ||
2160 inst->dst.file != MRF || inst->src[0].file != GRF ||
2161 inst->dst.type != inst->src[0].type ||
2162 inst->src[0].abs || inst->src[0].negate ||
2163 !inst->src[0].is_contiguous() ||
2164 inst->src[0].subreg_offset)
2165 continue;
2166
2167 /* Work out which hardware MRF registers are written by this
2168 * instruction.
2169 */
2170 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2171 int mrf_high;
2172 if (inst->dst.reg & BRW_MRF_COMPR4) {
2173 mrf_high = mrf_low + 4;
2174 } else if (dispatch_width == 16 &&
2175 (!inst->force_uncompressed && !inst->force_sechalf)) {
2176 mrf_high = mrf_low + 1;
2177 } else {
2178 mrf_high = mrf_low;
2179 }
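/* For example, a plain SIMD16 write to m4 spans m4..m5, hence
 * mrf_high = mrf_low + 1; a COMPR4 write presumably places its second
 * half four registers up (m4 and m8), hence mrf_low + 4 in that case.
 */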
2180
2181 /* Can't compute-to-MRF this GRF if someone else was going to
2182 * read it later.
2183 */
2184 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2185 continue;
2186
2187 /* Found a move of a GRF to a MRF. Let's see if we can go
2188 * rewrite the thing that made this GRF to write into the MRF.
2189 */
2190 fs_inst *scan_inst;
2191 for (scan_inst = (fs_inst *)inst->prev;
2192 !scan_inst->is_head_sentinel();
2193 scan_inst = (fs_inst *)scan_inst->prev) {
2194 if (scan_inst->dst.file == GRF &&
2195 scan_inst->dst.reg == inst->src[0].reg) {
2196 /* Found the last thing to write our reg we want to turn
2197 * into a compute-to-MRF.
2198 */
2199
2200 /* If this one instruction didn't populate all the
2201 * channels, bail. We might be able to rewrite everything
2202 * that writes that reg, but it would require smarter
2203 * tracking to delay the rewriting until complete success.
2204 */
2205 if (scan_inst->is_partial_write())
2206 break;
2207
2208 /* Things returning more than one register would need us to
2209 * understand coalescing out more than one MOV at a time.
2210 */
2211 if (scan_inst->regs_written > 1)
2212 break;
2213
2214 /* SEND instructions can't have MRF as a destination. */
2215 if (scan_inst->mlen)
2216 break;
2217
2218 if (brw->gen == 6) {
2219 /* gen6 math instructions must have the destination be
2220 * GRF, so no compute-to-MRF for them.
2221 */
2222 if (scan_inst->is_math()) {
2223 break;
2224 }
2225 }
2226
2227 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2228 /* Found the creator of our MRF's source value. */
2229 scan_inst->dst.file = MRF;
2230 scan_inst->dst.reg = inst->dst.reg;
2231 scan_inst->saturate |= inst->saturate;
2232 inst->remove();
2233 progress = true;
2234 }
2235 break;
2236 }
2237
2238 /* We don't handle control flow here. Most computation of
2239 * values that end up in MRFs are shortly before the MRF
2240 * write anyway.
2241 */
2242 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2243 break;
2244
2245 /* You can't read from an MRF, so if someone else reads our
2246 * MRF's source GRF that we wanted to rewrite, that stops us.
2247 */
2248 bool interfered = false;
2249 for (int i = 0; i < scan_inst->sources; i++) {
2250 if (scan_inst->src[i].file == GRF &&
2251 scan_inst->src[i].reg == inst->src[0].reg &&
2252 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2253 interfered = true;
2254 }
2255 }
2256 if (interfered)
2257 break;
2258
2259 if (scan_inst->dst.file == MRF) {
2260 /* If somebody else writes our MRF here, we can't
2261 * compute-to-MRF before that.
2262 */
2263 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2264 int scan_mrf_high;
2265
2266 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2267 scan_mrf_high = scan_mrf_low + 4;
2268 } else if (dispatch_width == 16 &&
2269 (!scan_inst->force_uncompressed &&
2270 !scan_inst->force_sechalf)) {
2271 scan_mrf_high = scan_mrf_low + 1;
2272 } else {
2273 scan_mrf_high = scan_mrf_low;
2274 }
2275
2276 if (mrf_low == scan_mrf_low ||
2277 mrf_low == scan_mrf_high ||
2278 mrf_high == scan_mrf_low ||
2279 mrf_high == scan_mrf_high) {
2280 break;
2281 }
2282 }
2283
2284 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2285 /* Found a SEND instruction, which means that there are
2286 * live values in MRFs from base_mrf to base_mrf +
2287 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2288 * above it.
2289 */
2290 if (mrf_low >= scan_inst->base_mrf &&
2291 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2292 break;
2293 }
2294 if (mrf_high >= scan_inst->base_mrf &&
2295 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2296 break;
2297 }
2298 }
2299 }
2300 }
2301
2302 if (progress)
2303 invalidate_live_intervals(false);
2304
2305 return progress;
2306 }
2307
2308 /**
2309 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2310 * instructions to FS_OPCODE_REP_FB_WRITE.
2311 */
2312 void
2313 fs_visitor::try_rep_send()
2314 {
2315 int i, count;
2316 fs_inst *start = NULL;
2317
2318 /* From the Ivybridge PRM, Volume 4 Part 1, section 3.9.11.2
2319 * ("Message Descriptor - Render Target Write"):
2320 *
2321 * "SIMD16_REPDATA message must not be used in SIMD8 pixel-shaders."
2322 */
2323 if (dispatch_width != 16)
2324 return;
2325
2326 /* The constant color write message can't handle anything but the 4 color
2327 * values. We could do MRT, but the loops below would need to understand
2328 * handling the header being enabled or disabled on different messages. It
2329 * also requires that the render target be tiled, which might not be the
2330 * case for some EGLImage paths or if we some day do rendering to PBOs.
2331 */
2332 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH) ||
2333 payload.aa_dest_stencil_reg ||
2334 payload.dest_depth_reg ||
2335 dual_src_output.file != BAD_FILE)
2336 return;
2337
2338 /* The optimization is implemented as one pass through the instruction
2339 * list. We keep track of the most recent block of MOVs into sequential
2340 * MRFs from single, sequential float registers (i.e. uniforms). Then when
2341 * we find an FB_WRITE opcode, we see if the payload registers match the
2342 * destination registers in our block of MOVs.
2343 */
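/* A hypothetical instance of the pattern: four MOVs of sequential
 * constant components into m2, m4, m6 and m8 (two MRFs apart, since each
 * SIMD16 color channel occupies two registers), followed by an FB_WRITE
 * whose payload starts at m2, collapse into one vec4 MOV plus a
 * replicated-data write.
 */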
2344 count = 0;
2345 foreach_in_list_safe(fs_inst, inst, &this->instructions) {
2346 if (count == 0)
2347 start = inst;
2348 if (inst->opcode == BRW_OPCODE_MOV &&
2349 inst->dst.file == MRF &&
2350 inst->dst.reg == start->dst.reg + 2 * count &&
2351 inst->src[0].file == HW_REG &&
2352 inst->src[0].reg_offset == start->src[0].reg_offset + count) {
2353 if (count == 0)
2354 start = inst;
2355 count++;
2356 }
2357
2358 if (inst->opcode == FS_OPCODE_FB_WRITE &&
2359 count == 4 &&
2360 (inst->base_mrf == start->dst.reg ||
2361 (inst->base_mrf + 2 == start->dst.reg && inst->header_present))) {
2362 fs_inst *mov = MOV(start->dst, start->src[0]);
2363
2364 /* Make a MOV that moves the four floats into the replicated write
2365 * payload. Since we're running at the very end of code generation
2366 * we can use hw registers and generate the stride and offsets we
2367 * need for this MOV. We use the first of the eight registers
2368 * allocated for the SIMD16 payload for the four floats.
2369 */
2370 mov->dst.fixed_hw_reg =
2371 brw_vec4_reg(BRW_MESSAGE_REGISTER_FILE,
2372 start->dst.reg, 0);
2373 mov->dst.file = HW_REG;
2374 mov->dst.type = mov->dst.fixed_hw_reg.type;
2375
2376 mov->src[0].fixed_hw_reg =
2377 brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2378 mov->src[0].file = HW_REG;
2379 mov->src[0].type = mov->src[0].fixed_hw_reg.type;
2380 mov->force_writemask_all = true;
2381 mov->dst.type = BRW_REGISTER_TYPE_F;
2382
2383 /* Replace the four MOVs with the new vec4 MOV. */
2384 start->insert_before(mov);
2385 for (i = 0; i < 4; i++)
2386 mov->next->remove();
2387
2388 /* Finally, adjust the message length and set the opcode to
2389 * REP_FB_WRITE for the send, so that the generator will use the
2390 * replicated data message type. Then reset count so we'll start
2391 * looking for a new block in case we're in a MRT shader.
2392 */
2393 inst->opcode = FS_OPCODE_REP_FB_WRITE;
2394 inst->mlen -= 7;
2395 count = 0;
2396 }
2397 }
2398
2399 return;
2400 }
2401
2402 /**
2403 * Walks through basic blocks, looking for repeated MRF writes and
2404 * removing the later ones.
2405 */
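/* For example, two identical "mov m3, vgrf5" instructions with no
 * intervening write to m3 or vgrf5 (and no SEND implicitly writing m3)
 * leave the second MOV redundant, so it gets removed.
 */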
2406 bool
2407 fs_visitor::remove_duplicate_mrf_writes()
2408 {
2409 fs_inst *last_mrf_move[16];
2410 bool progress = false;
2411
2412 /* We would need to update the MRF tracking for compressed instructions, so skip SIMD16 for now. */
2413 if (dispatch_width == 16)
2414 return false;
2415
2416 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2417
2418 calculate_cfg();
2419
2420 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2421 if (inst->is_control_flow()) {
2422 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2423 }
2424
2425 if (inst->opcode == BRW_OPCODE_MOV &&
2426 inst->dst.file == MRF) {
2427 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2428 if (prev_inst && inst->equals(prev_inst)) {
2429 inst->remove(block);
2430 progress = true;
2431 continue;
2432 }
2433 }
2434
2435 /* Clear out the last-write records for MRFs that were overwritten. */
2436 if (inst->dst.file == MRF) {
2437 last_mrf_move[inst->dst.reg] = NULL;
2438 }
2439
2440 if (inst->mlen > 0 && inst->base_mrf != -1) {
2441 /* Found a SEND instruction, which will include two or fewer
2442 * implied MRF writes. We could do better here.
2443 */
2444 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2445 last_mrf_move[inst->base_mrf + i] = NULL;
2446 }
2447 }
2448
2449 /* Clear out any MRF move records whose sources got overwritten. */
2450 if (inst->dst.file == GRF) {
2451 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2452 if (last_mrf_move[i] &&
2453 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2454 last_mrf_move[i] = NULL;
2455 }
2456 }
2457 }
2458
2459 if (inst->opcode == BRW_OPCODE_MOV &&
2460 inst->dst.file == MRF &&
2461 inst->src[0].file == GRF &&
2462 !inst->is_partial_write()) {
2463 last_mrf_move[inst->dst.reg] = inst;
2464 }
2465 }
2466
2467 if (progress)
2468 invalidate_live_intervals();
2469
2470 return progress;
2471 }
2472
2473 static void
2474 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2475 int first_grf, int grf_len)
2476 {
2477 bool inst_simd16 = (dispatch_width > 8 &&
2478 !inst->force_uncompressed &&
2479 !inst->force_sechalf);
2480
2481 /* Clear the flag for registers that actually got read (as expected). */
2482 for (int i = 0; i < inst->sources; i++) {
2483 int grf;
2484 if (inst->src[i].file == GRF) {
2485 grf = inst->src[i].reg;
2486 } else if (inst->src[i].file == HW_REG &&
2487 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2488 grf = inst->src[i].fixed_hw_reg.nr;
2489 } else {
2490 continue;
2491 }
2492
2493 if (grf >= first_grf &&
2494 grf < first_grf + grf_len) {
2495 deps[grf - first_grf] = false;
2496 if (inst_simd16)
2497 deps[grf - first_grf + 1] = false;
2498 }
2499 }
2500 }
2501
2502 /**
2503 * Implements this workaround for the original 965:
2504 *
2505 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2506 * check for post destination dependencies on this instruction, software
2507 * must ensure that there is no destination hazard for the case of ‘write
2508 * followed by a posted write’ shown in the following example.
2509 *
2510 * 1. mov r3 0
2511 * 2. send r3.xy <rest of send instruction>
2512 * 3. mov r2 r3
2513 *
2514 * Due to no post-destination dependency check on the ‘send’, the above
2515 * code sequence could have two instructions (1 and 2) in flight at the
2516 * same time that both consider ‘r3’ as the target of their final writes.
2517 */
2518 void
2519 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2520 {
2521 int reg_size = dispatch_width / 8;
2522 int write_len = inst->regs_written * reg_size;
2523 int first_write_grf = inst->dst.reg;
2524 bool needs_dep[BRW_MAX_MRF];
2525 assert(write_len < (int)sizeof(needs_dep) - 1);
2526
2527 memset(needs_dep, false, sizeof(needs_dep));
2528 memset(needs_dep, true, write_len);
2529
2530 clear_deps_for_inst_src(inst, dispatch_width,
2531 needs_dep, first_write_grf, write_len);
2532
2533 /* Walk backwards looking for writes to registers we're writing which
2534 * aren't read since being written. If we hit the start of the program,
2535 * we assume that there are no outstanding dependencies on entry to the
2536 * program.
2537 */
2538 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2539 !scan_inst->is_head_sentinel();
2540 scan_inst = (fs_inst *)scan_inst->prev) {
2541
2542 /* If we hit control flow, assume that there *are* outstanding
2543 * dependencies, and force their cleanup before our instruction.
2544 */
2545 if (scan_inst->is_control_flow()) {
2546 for (int i = 0; i < write_len; i++) {
2547 if (needs_dep[i]) {
2548 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2549 }
2550 }
2551 return;
2552 }
2553
2554 bool scan_inst_simd16 = (dispatch_width > 8 &&
2555 !scan_inst->force_uncompressed &&
2556 !scan_inst->force_sechalf);
2557
2558 /* We insert our reads as late as possible on the assumption that any
2559 * instruction but a MOV that might have left us an outstanding
2560 * dependency has more latency than a MOV.
2561 */
2562 if (scan_inst->dst.file == GRF) {
2563 for (int i = 0; i < scan_inst->regs_written; i++) {
2564 int reg = scan_inst->dst.reg + i * reg_size;
2565
2566 if (reg >= first_write_grf &&
2567 reg < first_write_grf + write_len &&
2568 needs_dep[reg - first_write_grf]) {
2569 inst->insert_before(DEP_RESOLVE_MOV(reg));
2570 needs_dep[reg - first_write_grf] = false;
2571 if (scan_inst_simd16)
2572 needs_dep[reg - first_write_grf + 1] = false;
2573 }
2574 }
2575 }
2576
2577 /* Clear the flag for registers that actually got read (as expected). */
2578 clear_deps_for_inst_src(scan_inst, dispatch_width,
2579 needs_dep, first_write_grf, write_len);
2580
2581 /* Continue the loop only if we haven't resolved all the dependencies */
2582 int i;
2583 for (i = 0; i < write_len; i++) {
2584 if (needs_dep[i])
2585 break;
2586 }
2587 if (i == write_len)
2588 return;
2589 }
2590 }
2591
2592 /**
2593 * Implements this workaround for the original 965:
2594 *
2595 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2596 * used as a destination register until after it has been sourced by an
2597 * instruction with a different destination register.
2598 */
2599 void
2600 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2601 {
2602 int write_len = inst->regs_written * dispatch_width / 8;
2603 int first_write_grf = inst->dst.reg;
2604 bool needs_dep[BRW_MAX_MRF];
2605 assert(write_len < (int)sizeof(needs_dep) - 1);
2606
2607 memset(needs_dep, false, sizeof(needs_dep));
2608 memset(needs_dep, true, write_len);
2609 /* Walk forwards looking for writes to registers we're writing which aren't
2610 * read before being written.
2611 */
2612 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2613 !scan_inst->is_tail_sentinel();
2614 scan_inst = (fs_inst *)scan_inst->next) {
2615 /* If we hit control flow, force resolve all remaining dependencies. */
2616 if (scan_inst->is_control_flow()) {
2617 for (int i = 0; i < write_len; i++) {
2618 if (needs_dep[i])
2619 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2620 }
2621 return;
2622 }
2623
2624 /* Clear the flag for registers that actually got read (as expected). */
2625 clear_deps_for_inst_src(scan_inst, dispatch_width,
2626 needs_dep, first_write_grf, write_len);
2627
2628 /* We insert our reads as late as possible since they're reading the
2629 * result of a SEND, which has massive latency.
2630 */
2631 if (scan_inst->dst.file == GRF &&
2632 scan_inst->dst.reg >= first_write_grf &&
2633 scan_inst->dst.reg < first_write_grf + write_len &&
2634 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2635 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2636 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2637 }
2638
2639 /* Continue the loop only if we haven't resolved all the dependencies */
2640 int i;
2641 for (i = 0; i < write_len; i++) {
2642 if (needs_dep[i])
2643 break;
2644 }
2645 if (i == write_len)
2646 return;
2647 }
2648
2649 /* If we hit the end of the program, resolve all remaining dependencies out
2650 * of paranoia.
2651 */
2652 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2653 assert(last_inst->eot);
2654 for (int i = 0; i < write_len; i++) {
2655 if (needs_dep[i])
2656 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2657 }
2658 }
2659
2660 void
2661 fs_visitor::insert_gen4_send_dependency_workarounds()
2662 {
2663 if (brw->gen != 4 || brw->is_g4x)
2664 return;
2665
2666 bool progress = false;
2667
2668 /* Note that we're done with register allocation, so GRF fs_regs always
2669 * have a .reg_offset of 0.
2670 */
2671
2672 foreach_in_list_safe(fs_inst, inst, &instructions) {
2673 if (inst->mlen != 0 && inst->dst.file == GRF) {
2674 insert_gen4_pre_send_dependency_workarounds(inst);
2675 insert_gen4_post_send_dependency_workarounds(inst);
2676 progress = true;
2677 }
2678 }
2679
2680 if (progress)
2681 invalidate_live_intervals();
2682 }
2683
2684 /**
2685 * Turns the generic expression-style uniform pull constant load instruction
2686 * into a hardware-specific series of instructions for loading a pull
2687 * constant.
2688 *
2689 * The expression style allows the CSE pass before this to optimize out
2690 * repeated loads from the same offset, and gives the pre-register-allocation
2691 * scheduling full flexibility, while the conversion to native instructions
2692 * allows the post-register-allocation scheduler the best information
2693 * possible.
2694 *
2695 * Note that execution masking for setting up pull constant loads is special:
2696 * the channels that need to be written are unrelated to the current execution
2697 * mask, since a later instruction will use one of the result channels as a
2698 * source operand for all 8 or 16 of its channels.
2699 */
2700 void
2701 fs_visitor::lower_uniform_pull_constant_loads()
2702 {
2703 calculate_cfg();
2704
2705 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2706 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2707 continue;
2708
2709 if (brw->gen >= 7) {
2710 /* The offset arg before was a vec4-aligned byte offset. We need to
2711 * turn it into a dword offset.
2712 */
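/* For example, a vec4-aligned byte offset of 32 (the third vec4 in the
 * constant buffer) becomes dword offset 8 after the division below.
 */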
2713 fs_reg const_offset_reg = inst->src[1];
2714 assert(const_offset_reg.file == IMM &&
2715 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2716 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2717 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2718
2719 /* This is actually going to be a MOV, but since only the first dword
2720 * is accessed, we have a special opcode to do just that one. Note
2721 * that this needs to be an operation that will be considered a def
2722 * by live variable analysis, or register allocation will explode.
2723 */
2724 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2725 payload, const_offset_reg);
2726 setup->force_writemask_all = true;
2727
2728 setup->ir = inst->ir;
2729 setup->annotation = inst->annotation;
2730 inst->insert_before(block, setup);
2731
2732 /* Similarly, this will only populate the first 4 channels of the
2733 * result register (since we only use smear values from 0-3), but we
2734 * don't tell the optimizer.
2735 */
2736 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2737 inst->src[1] = payload;
2738
2739 invalidate_live_intervals(false);
2740 } else {
2741 /* Before register allocation, we didn't tell the scheduler about the
2742 * MRF we use. We know it's safe to use this MRF because nothing
2743 * else does except for register spill/unspill, which generates and
2744 * uses its MRF within a single IR instruction.
2745 */
2746 inst->base_mrf = 14;
2747 inst->mlen = 1;
2748 }
2749 }
2750 }
2751
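/* Roughly: expand each SHADER_OPCODE_LOAD_PAYLOAD into one MOV per source,
 * writing consecutive reg_offsets of the destination, with src[0] treated
 * as the optional message header.
 */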
2752 bool
2753 fs_visitor::lower_load_payload()
2754 {
2755 bool progress = false;
2756
2757 calculate_cfg();
2758
2759 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2760 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2761 fs_reg dst = inst->dst;
2762
2763 /* src[0] represents the (optional) message header. */
2764 if (inst->src[0].file != BAD_FILE) {
2765 inst->insert_before(block, MOV(dst, inst->src[0]));
2766 }
2767 dst.reg_offset++;
2768
2769 for (int i = 1; i < inst->sources; i++) {
2770 inst->insert_before(block, MOV(dst, inst->src[i]));
2771 dst.reg_offset++;
2772 }
2773
2774 inst->remove(block);
2775 progress = true;
2776 }
2777 }
2778
2779 if (progress)
2780 invalidate_live_intervals(false);
2781
2782 return progress;
2783 }
2784
2785 void
2786 fs_visitor::dump_instructions()
2787 {
2788 dump_instructions(NULL);
2789 }
2790
2791 void
2792 fs_visitor::dump_instructions(const char *name)
2793 {
2794 calculate_register_pressure();
2795 FILE *file = stderr;
2796 if (name && geteuid() != 0) {
2797 file = fopen(name, "w");
2798 if (!file)
2799 file = stderr;
2800 }
2801
2802 int ip = 0, max_pressure = 0;
2803 foreach_in_list(backend_instruction, inst, &instructions) {
2804 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2805 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2806 dump_instruction(inst, file);
2807 ++ip;
2808 }
2809 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2810
2811 if (file != stderr) {
2812 fclose(file);
2813 }
2814 }
2815
2816 void
2817 fs_visitor::dump_instruction(backend_instruction *be_inst)
2818 {
2819 dump_instruction(be_inst, stderr);
2820 }
2821
2822 void
2823 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2824 {
2825 fs_inst *inst = (fs_inst *)be_inst;
2826
2827 if (inst->predicate) {
2828 fprintf(file, "(%cf0.%d) ",
2829 inst->predicate_inverse ? '-' : '+',
2830 inst->flag_subreg);
2831 }
2832
2833 fprintf(file, "%s", brw_instruction_name(inst->opcode));
2834 if (inst->saturate)
2835 fprintf(file, ".sat");
2836 if (inst->conditional_mod) {
2837 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2838 if (!inst->predicate &&
2839 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2840 inst->opcode != BRW_OPCODE_IF &&
2841 inst->opcode != BRW_OPCODE_WHILE))) {
2842 fprintf(file, ".f0.%d", inst->flag_subreg);
2843 }
2844 }
2845 fprintf(file, " ");
2846
2847
2848 switch (inst->dst.file) {
2849 case GRF:
2850 fprintf(file, "vgrf%d", inst->dst.reg);
2851 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2852 inst->dst.subreg_offset)
2853 fprintf(file, "+%d.%d",
2854 inst->dst.reg_offset, inst->dst.subreg_offset);
2855 break;
2856 case MRF:
2857 fprintf(file, "m%d", inst->dst.reg);
2858 break;
2859 case BAD_FILE:
2860 fprintf(file, "(null)");
2861 break;
2862 case UNIFORM:
2863 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2864 break;
2865 case HW_REG:
2866 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2867 switch (inst->dst.fixed_hw_reg.nr) {
2868 case BRW_ARF_NULL:
2869 fprintf(file, "null");
2870 break;
2871 case BRW_ARF_ADDRESS:
2872 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2873 break;
2874 case BRW_ARF_ACCUMULATOR:
2875 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2876 break;
2877 case BRW_ARF_FLAG:
2878 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2879 inst->dst.fixed_hw_reg.subnr);
2880 break;
2881 default:
2882 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2883 inst->dst.fixed_hw_reg.subnr);
2884 break;
2885 }
2886 } else {
2887 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2888 }
2889 if (inst->dst.fixed_hw_reg.subnr)
2890 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2891 break;
2892 default:
2893 fprintf(file, "???");
2894 break;
2895 }
2896 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2897
2898 for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2899 if (inst->src[i].negate)
2900 fprintf(file, "-");
2901 if (inst->src[i].abs)
2902 fprintf(file, "|");
2903 switch (inst->src[i].file) {
2904 case GRF:
2905 fprintf(file, "vgrf%d", inst->src[i].reg);
2906 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2907 inst->src[i].subreg_offset)
2908 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2909 inst->src[i].subreg_offset);
2910 break;
2911 case MRF:
2912 fprintf(file, "***m%d***", inst->src[i].reg);
2913 break;
2914 case UNIFORM:
2915 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2916 if (inst->src[i].reladdr) {
2917 fprintf(file, "+reladdr");
2918 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2919 inst->src[i].subreg_offset) {
2920 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2921 inst->src[i].subreg_offset);
2922 }
2923 break;
2924 case BAD_FILE:
2925 fprintf(file, "(null)");
2926 break;
2927 case IMM:
2928 switch (inst->src[i].type) {
2929 case BRW_REGISTER_TYPE_F:
2930 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
2931 break;
2932 case BRW_REGISTER_TYPE_D:
2933 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
2934 break;
2935 case BRW_REGISTER_TYPE_UD:
2936 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
2937 break;
2938 default:
2939 fprintf(file, "???");
2940 break;
2941 }
2942 break;
2943 case HW_REG:
2944 if (inst->src[i].fixed_hw_reg.negate)
2945 fprintf(file, "-");
2946 if (inst->src[i].fixed_hw_reg.abs)
2947 fprintf(file, "|");
2948 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2949 switch (inst->src[i].fixed_hw_reg.nr) {
2950 case BRW_ARF_NULL:
2951 fprintf(file, "null");
2952 break;
2953 case BRW_ARF_ADDRESS:
2954 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2955 break;
2956 case BRW_ARF_ACCUMULATOR:
2957 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2958 break;
2959 case BRW_ARF_FLAG:
2960 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2961 inst->src[i].fixed_hw_reg.subnr);
2962 break;
2963 default:
2964 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2965 inst->src[i].fixed_hw_reg.subnr);
2966 break;
2967 }
2968 } else {
2969 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2970 }
2971 if (inst->src[i].fixed_hw_reg.subnr)
2972 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2973 if (inst->src[i].fixed_hw_reg.abs)
2974 fprintf(file, "|");
2975 break;
2976 default:
2977 fprintf(file, "???");
2978 break;
2979 }
2980 if (inst->src[i].abs)
2981 fprintf(file, "|");
2982
2983 if (inst->src[i].file != IMM) {
2984 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2985 }
2986
2987 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2988 fprintf(file, ", ");
2989 }
2990
2991 fprintf(file, " ");
2992
2993 if (inst->force_uncompressed)
2994 fprintf(file, "1sthalf ");
2995
2996 if (inst->force_sechalf)
2997 fprintf(file, "2ndhalf ");
2998
2999 fprintf(file, "\n");
3000 }
3001
3002 /**
3003 * Possibly returns an instruction that set up @param reg.
3004 *
3005 * Sometimes we want to take the result of some expression/variable
3006 * dereference tree and rewrite the instruction generating the result
3007 * of the tree. When processing the tree, we know that the
3008 * instructions generated are all writing temporaries that are dead
3009 * outside of this tree. So, if we have some instructions that write
3010 * a temporary, we're free to point that temp write somewhere else.
3011 *
3012 * Note that this doesn't guarantee that the returned instruction wrote
3013 * only reg -- it might be the size=4 destination of a texture instruction.
3014 */
3015 fs_inst *
3016 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3017 fs_inst *end,
3018 const fs_reg &reg)
3019 {
3020 if (end == start ||
3021 end->is_partial_write() ||
3022 reg.reladdr ||
3023 !reg.equals(end->dst)) {
3024 return NULL;
3025 } else {
3026 return end;
3027 }
3028 }
3029
3030 void
3031 fs_visitor::setup_payload_gen6()
3032 {
3033 bool uses_depth =
3034 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3035 unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
3036
3037 assert(brw->gen >= 6);
3038
3039 /* R0-1: masks, pixel X/Y coordinates. */
3040 payload.num_regs = 2;
3041 /* R2: only for 32-pixel dispatch. */
3042
3043 /* R3-26: barycentric interpolation coordinates. These appear in the
3044 * same order that they appear in the brw_wm_barycentric_interp_mode
3045 * enum. Each set of coordinates occupies 2 registers if dispatch width
3046 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3047 * appear if they were enabled using the "Barycentric Interpolation
3048 * Mode" bits in WM_STATE.
3049 */
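/* For example, with two barycentric modes enabled (say perspective pixel
 * and perspective centroid) in SIMD16, the loop below reserves 4 registers
 * for each, so payload.num_regs grows from 2 to 10 before the depth and W
 * registers are considered.
 */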
3050 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3051 if (barycentric_interp_modes & (1 << i)) {
3052 payload.barycentric_coord_reg[i] = payload.num_regs;
3053 payload.num_regs += 2;
3054 if (dispatch_width == 16) {
3055 payload.num_regs += 2;
3056 }
3057 }
3058 }
3059
3060 /* R27: interpolated depth if uses source depth */
3061 if (uses_depth) {
3062 payload.source_depth_reg = payload.num_regs;
3063 payload.num_regs++;
3064 if (dispatch_width == 16) {
3065 /* R28: interpolated depth if not SIMD8. */
3066 payload.num_regs++;
3067 }
3068 }
3069 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3070 if (uses_depth) {
3071 payload.source_w_reg = payload.num_regs;
3072 payload.num_regs++;
3073 if (dispatch_width == 16) {
3074 /* R30: interpolated W if not SIMD8. */
3075 payload.num_regs++;
3076 }
3077 }
3078
3079 prog_data->uses_pos_offset = key->compute_pos_offset;
3080 /* R31: MSAA position offsets. */
3081 if (prog_data->uses_pos_offset) {
3082 payload.sample_pos_reg = payload.num_regs;
3083 payload.num_regs++;
3084 }
3085
3086 /* R32: MSAA input coverage mask */
3087 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3088 assert(brw->gen >= 7);
3089 payload.sample_mask_in_reg = payload.num_regs;
3090 payload.num_regs++;
3091 if (dispatch_width == 16) {
3092 /* R33: input coverage mask if not SIMD8. */
3093 payload.num_regs++;
3094 }
3095 }
3096
3097 /* R34-: bary for 32-pixel. */
3098 /* R58-59: interp W for 32-pixel. */
3099
3100 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3101 source_depth_to_render_target = true;
3102 }
3103 }
3104
3105 void
3106 fs_visitor::assign_binding_table_offsets()
3107 {
3108 uint32_t next_binding_table_offset = 0;
3109
3110 /* If there are no color regions, we still perform an FB write to a null
3111 * renderbuffer, which we place at surface index 0.
3112 */
3113 prog_data->binding_table.render_target_start = next_binding_table_offset;
3114 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3115
3116 assign_common_binding_table_offsets(next_binding_table_offset);
3117 }
3118
3119 void
3120 fs_visitor::calculate_register_pressure()
3121 {
3122 invalidate_live_intervals(false);
3123 calculate_live_intervals();
3124
3125 unsigned num_instructions = instructions.length();
3126
3127 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3128
3129 for (int reg = 0; reg < virtual_grf_count; reg++) {
3130 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3131 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3132 }
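/* For example, a size-2 VGRF live from ip 10 through ip 20 adds 2 to
 * regs_live_at_ip[] at each of those instruction points, which is what
 * dump_instructions() prints as per-instruction pressure.
 */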
3133 }
3134
3135 /**
3136 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3137 *
3138 * The needs_unlit_centroid_workaround ends up producing one of these per
3139 * channel of centroid input, so it's good to clean them up.
3140 *
3141 * An assumption here is that nothing ever modifies the dispatched pixels
3142 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3143 * dictates that anyway.
3144 */
3145 void
3146 fs_visitor::opt_drop_redundant_mov_to_flags()
3147 {
3148 bool flag_mov_found[2] = {false};
3149
3150 foreach_in_list_safe(fs_inst, inst, &instructions) {
3151 if (inst->is_control_flow()) {
3152 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3153 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3154 if (!flag_mov_found[inst->flag_subreg])
3155 flag_mov_found[inst->flag_subreg] = true;
3156 else
3157 inst->remove();
3158 } else if (inst->writes_flag()) {
3159 flag_mov_found[inst->flag_subreg] = false;
3160 }
3161 }
3162 }
3163
3164 bool
3165 fs_visitor::run()
3166 {
3167 sanity_param_count = fp->Base.Parameters->NumParameters;
3168 bool allocated_without_spills;
3169
3170 assign_binding_table_offsets();
3171
3172 if (brw->gen >= 6)
3173 setup_payload_gen6();
3174 else
3175 setup_payload_gen4();
3176
3177 if (0) {
3178 emit_dummy_fs();
3179 } else {
3180 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3181 emit_shader_time_begin();
3182
3183 calculate_urb_setup();
3184 if (fp->Base.InputsRead > 0) {
3185 if (brw->gen < 6)
3186 emit_interpolation_setup_gen4();
3187 else
3188 emit_interpolation_setup_gen6();
3189 }
3190
3191 /* We handle discards by keeping track of the still-live pixels in f0.1.
3192 * Initialize it with the dispatched pixels.
3193 */
3194 if (fp->UsesKill || key->alpha_test_func) {
3195 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3196 discard_init->flag_subreg = 1;
3197 }
3198
3199 /* Generate FS IR for main(). (the visitor only descends into
3200 * functions called "main").
3201 */
3202 if (shader) {
3203 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3204 base_ir = ir;
3205 this->result = reg_undef;
3206 ir->accept(this);
3207 }
3208 } else {
3209 emit_fragment_program_code();
3210 }
3211 base_ir = NULL;
3212 if (failed)
3213 return false;
3214
3215 emit(FS_OPCODE_PLACEHOLDER_HALT);
3216
3217 if (key->alpha_test_func)
3218 emit_alpha_test();
3219
3220 emit_fb_writes();
3221
3222 split_virtual_grfs();
3223
3224 move_uniform_array_access_to_pull_constants();
3225 assign_constant_locations();
3226 demote_pull_constants();
3227
3228 opt_drop_redundant_mov_to_flags();
3229
3230 #define OPT(pass, args...) do { \
3231 pass_num++; \
3232 bool this_progress = pass(args); \
3233 \
3234 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3235 char filename[64]; \
3236 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3237 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3238 \
3239 backend_visitor::dump_instructions(filename); \
3240 } \
3241 \
3242 progress = progress || this_progress; \
3243 } while (false)
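/* When INTEL_DEBUG & DEBUG_OPTIMIZER is set, each pass that makes progress
 * dumps the IR to a file named from the format above, e.g.
 * "fs8-0003-01-02-opt_cse" for a SIMD8 compile of shader program 3,
 * iteration 1, pass 2.
 */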
3244
3245 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3246 char filename[64];
3247 snprintf(filename, 64, "fs%d-%04d-00-start",
3248 dispatch_width, shader_prog ? shader_prog->Name : 0);
3249
3250 backend_visitor::dump_instructions(filename);
3251 }
3252
3253 bool progress;
3254 int iteration = 0;
3255 do {
3256 progress = false;
3257 iteration++;
3258 int pass_num = 0;
3259
3260 compact_virtual_grfs();
3261
3262 OPT(remove_duplicate_mrf_writes);
3263
3264 OPT(opt_algebraic);
3265 OPT(opt_cse);
3266 OPT(opt_copy_propagate);
3267 OPT(opt_peephole_predicated_break);
3268 OPT(dead_code_eliminate);
3269 OPT(opt_peephole_sel);
3270 OPT(dead_control_flow_eliminate, this);
3271 OPT(opt_register_renaming);
3272 OPT(opt_saturate_propagation);
3273 OPT(register_coalesce);
3274 OPT(compute_to_mrf);
3275 } while (progress);
3276
3277 if (lower_load_payload()) {
3278 register_coalesce();
3279 dead_code_eliminate();
3280 }
3281
3282 lower_uniform_pull_constant_loads();
3283
3284 assign_curb_setup();
3285 assign_urb_setup();
3286
3287 static enum instruction_scheduler_mode pre_modes[] = {
3288 SCHEDULE_PRE,
3289 SCHEDULE_PRE_NON_LIFO,
3290 SCHEDULE_PRE_LIFO,
3291 };
3292
3293 /* Try each scheduling heuristic to see if it can successfully register
3294 * allocate without spilling. They should be ordered by decreasing
3295 * performance but increasing likelihood of allocating.
3296 */
3297 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3298 schedule_instructions(pre_modes[i]);
3299
3300 if (0) {
3301 assign_regs_trivial();
3302 allocated_without_spills = true;
3303 } else {
3304 allocated_without_spills = assign_regs(false);
3305 }
3306 if (allocated_without_spills)
3307 break;
3308 }
3309
3310 if (!allocated_without_spills) {
3311 /* We assume that any spilling is worse than just dropping back to
3312 * SIMD8. There's probably actually some intermediate point where
3313 * SIMD16 with a couple of spills is still better.
3314 */
3315 if (dispatch_width == 16) {
3316 fail("Failure to register allocate. Reduce number of "
3317 "live scalar values to avoid this.");
3318 } else {
3319 perf_debug("Fragment shader triggered register spilling. "
3320 "Try reducing the number of live scalar values to "
3321 "improve performance.\n");
3322 }
3323
3324 /* Since we're out of heuristics, just go spill registers until we
3325 * get an allocation.
3326 */
3327 while (!assign_regs(true)) {
3328 if (failed)
3329 break;
3330 }
3331 }
3332 }
3333 assert(force_uncompressed_stack == 0);
3334
3335 /* This must come after all optimization and register allocation, since
3336 * it inserts dead code that happens to have side effects, and it does
3337 * so based on the actual physical registers in use.
3338 */
3339 insert_gen4_send_dependency_workarounds();
3340
3341 if (failed)
3342 return false;
3343
3344 if (!allocated_without_spills)
3345 schedule_instructions(SCHEDULE_POST);
3346
3347 if (last_scratch > 0) {
3348 prog_data->base.total_scratch = brw_get_scratch_size(last_scratch);
3349 }
3350
3351 if (brw->use_rep_send)
3352 try_rep_send();
3353
3354 if (dispatch_width == 8)
3355 prog_data->reg_blocks = brw_register_blocks(grf_used);
3356 else
3357 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3358
3359 /* If any state parameters were appended, then ParameterValues could have
3360 * been realloced, in which case the driver uniform storage set up by
3361 * _mesa_associate_uniform_storage() would point to freed memory. Make
3362 * sure that didn't happen.
3363 */
3364 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3365
3366 calculate_cfg();
3367
3368 return !failed;
3369 }
3370
3371 const unsigned *
3372 brw_wm_fs_emit(struct brw_context *brw,
3373 void *mem_ctx,
3374 const struct brw_wm_prog_key *key,
3375 struct brw_wm_prog_data *prog_data,
3376 struct gl_fragment_program *fp,
3377 struct gl_shader_program *prog,
3378 unsigned *final_assembly_size)
3379 {
3380 bool start_busy = false;
3381 double start_time = 0;
3382
3383 if (unlikely(brw->perf_debug)) {
3384 start_busy = (brw->batch.last_bo &&
3385 drm_intel_bo_busy(brw->batch.last_bo));
3386 start_time = get_time();
3387 }
3388
3389 struct brw_shader *shader = NULL;
3390 if (prog)
3391 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3392
3393 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3394 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3395
3396 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3397 */
3398 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3399 if (!v.run()) {
3400 if (prog) {
3401 prog->LinkStatus = false;
3402 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3403 }
3404
3405 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3406 v.fail_msg);
3407
3408 return NULL;
3409 }
3410
3411 cfg_t *simd16_cfg = NULL;
3412 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3413 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3414 brw->use_rep_send)) {
3415 if (!v.simd16_unsupported) {
3416 /* Try a SIMD16 compile */
3417 v2.import_uniforms(&v);
3418 if (!v2.run()) {
3419 perf_debug("SIMD16 shader failed to compile, falling back to "
3420 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3421 } else {
3422 simd16_cfg = v2.cfg;
3423 }
3424 } else {
3425 perf_debug("SIMD16 shader unsupported, falling back to "
3426 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3427 }
3428 }
3429
3430 cfg_t *simd8_cfg;
3431 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3432 if (no_simd8 && simd16_cfg) {
3433 simd8_cfg = NULL;
3434 prog_data->no_8 = true;
3435 } else {
3436 simd8_cfg = v.cfg;
3437 prog_data->no_8 = false;
3438 }
3439
3440 const unsigned *assembly = NULL;
3441 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3442 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3443 assembly = g.generate_assembly(simd8_cfg, simd16_cfg,
3444 final_assembly_size);
3445
3446 if (unlikely(brw->perf_debug) && shader) {
3447 if (shader->compiled_once)
3448 brw_wm_debug_recompile(brw, prog, key);
3449 shader->compiled_once = true;
3450
3451 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3452 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3453 (get_time() - start_time) * 1000);
3454 }
3455 }
3456
3457 return assembly;
3458 }
3459
3460 bool
3461 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3462 {
3463 struct brw_context *brw = brw_context(ctx);
3464 struct brw_wm_prog_key key;
3465
3466 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3467 return true;
3468
3469 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3470 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3471 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3472 bool program_uses_dfdy = fp->UsesDFdy;
3473
3474 memset(&key, 0, sizeof(key));
3475
3476 if (brw->gen < 6) {
3477 if (fp->UsesKill)
3478 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3479
3480 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3481 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3482
3483 /* Just assume depth testing. */
3484 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3485 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3486 }
3487
3488 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3489 BRW_FS_VARYING_INPUT_MASK) > 16)
3490 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3491
3492 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3493 for (unsigned i = 0; i < sampler_count; i++) {
3494 if (fp->Base.ShadowSamplers & (1 << i)) {
3495 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3496 key.tex.swizzles[i] =
3497 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3498 } else {
3499 /* Color sampler: assume no swizzling. */
3500 key.tex.swizzles[i] = SWIZZLE_XYZW;
3501 }
3502 }
3503
3504 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3505 key.drawable_height = ctx->DrawBuffer->Height;
3506 }
3507
3508 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3509 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3510 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3511
3512 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3513 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3514 key.nr_color_regions > 1;
3515 }
3516
3517 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3518 * quality of the derivatives is likely to be determined by the driconf
3519 * option.
3520 */
3521 key.high_quality_derivatives = brw->disable_derivative_optimization;
3522
3523 key.program_string_id = bfp->id;
3524
3525 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3526 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3527
3528 bool success = do_wm_prog(brw, prog, bfp, &key);
3529
3530 brw->wm.base.prog_offset = old_prog_offset;
3531 brw->wm.prog_data = old_prog_data;
3532
3533 return success;
3534 }