i965/fs: Simplify computation of key.input_slots_valid during precompile.
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
66 }
67
68 fs_inst::fs_inst()
69 {
70 init();
71 }
72
73 fs_inst::fs_inst(enum opcode opcode)
74 {
75 init();
76 this->opcode = opcode;
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
80 {
81 init();
82 this->opcode = opcode;
83 this->dst = dst;
84
85 if (dst.file == GRF)
86 assert(dst.reg_offset >= 0);
87 }
88
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
90 {
91 init();
92 this->opcode = opcode;
93 this->dst = dst;
94 this->src[0] = src0;
95
96 if (dst.file == GRF)
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
103 {
104 init();
105 this->opcode = opcode;
106 this->dst = dst;
107 this->src[0] = src0;
108 this->src[1] = src1;
109
110 if (dst.file == GRF)
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
116 }
117
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
120 {
121 init();
122 this->opcode = opcode;
123 this->dst = dst;
124 this->src[0] = src0;
125 this->src[1] = src1;
126 this->src[2] = src2;
127
128 if (dst.file == GRF)
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
136 }
137
138 #define ALU1(op) \
139 fs_inst * \
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
141 { \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
143 }
144
145 #define ALU2(op) \
146 fs_inst * \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
148 { \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
150 }
151
152 #define ALU3(op) \
153 fs_inst * \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
155 { \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
157 }
158
159 ALU1(NOT)
160 ALU1(MOV)
161 ALU1(FRC)
162 ALU1(RNDD)
163 ALU1(RNDE)
164 ALU1(RNDZ)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
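
/* Illustrative expansion (not part of the upstream file): ALU2(ADD)
 * above generates
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * so a call like emit(ADD(dst, a, b)) allocates one ADD fs_inst out of
 * mem_ctx and appends it to the instruction stream.
 */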
183
184 /** Gen4 predicated IF. */
185 fs_inst *
186 fs_visitor::IF(uint32_t predicate)
187 {
188 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
189 inst->predicate = predicate;
190 return inst;
191 }
192
193 /** Gen6+ IF with embedded comparison. */
194 fs_inst *
195 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
196 {
197 assert(brw->gen >= 6);
198 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
199 reg_null_d, src0, src1);
200 inst->conditional_mod = condition;
201 return inst;
202 }
203
204 /**
205 * CMP: Sets the low bit of the destination channels with the result
206 * of the comparison, while the upper bits are undefined, and updates
207 * the flag register with the packed 16 bits of the result.
208 */
209 fs_inst *
210 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
211 {
212 fs_inst *inst;
213
214 /* Take the instruction:
215 *
216 * CMP null<d> src0<f> src1<f>
217 *
218 * Original gen4 does type conversion to the destination type before
219 * comparison, producing garbage results for floating point comparisons.
220 * gen5 does the comparison on the execution type (resolved source types),
221 * so dst type doesn't matter. gen6 does comparison and then uses the
222 * result as if it was the dst type with no conversion, which happens to
223 * mostly work out for float-interpreted-as-int since our comparisons are
224 * for >0, =0, <0.
225 */
226 if (brw->gen == 4) {
227 dst.type = src0.type;
228 if (dst.file == HW_REG)
229 dst.fixed_hw_reg.type = dst.type;
230 }
231
232 resolve_ud_negate(&src0);
233 resolve_ud_negate(&src1);
234
235 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
236 inst->conditional_mod = condition;
237
238 return inst;
239 }
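
/* Sketch of typical usage (assumed caller pattern, not code from this
 * file): pre-gen6 conditionals are built from a CMP that writes the
 * flag register followed by a predicated IF, while gen6+ folds the
 * comparison into the IF itself:
 *
 *    if (brw->gen >= 6) {
 *       emit(IF(x, fs_reg(0.0f), BRW_CONDITIONAL_G));
 *    } else {
 *       emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_G));
 *       emit(IF(BRW_PREDICATE_NORMAL));
 *    }
 */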
240
241 exec_list
242 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
243 fs_reg varying_offset,
244 uint32_t const_offset)
245 {
246 exec_list instructions;
247 fs_inst *inst;
248
249 /* We have our constant surface use a pitch of 4 bytes, so our index can
250 * be any component of a vector, and then we load 4 contiguous
251 * components starting from that.
252 *
253 * We break down the const_offset to a portion added to the variable
254 * offset and a portion done using reg_offset, which means that if you
255 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
256 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
257 * CSE can later notice that those loads are all the same and eliminate
258 * the redundant ones.
259 */
260 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
261 instructions.push_tail(ADD(vec4_offset,
262 varying_offset, const_offset & ~3));
263
264 int scale = 1;
265 if (brw->gen == 4 && dispatch_width == 8) {
266 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
267 * u, v, r) as parameters, or we can just use the SIMD16 message
268 * consisting of (header, u). We choose the second, at the cost of a
269 * longer return length.
270 */
271 scale = 2;
272 }
273
274 enum opcode op;
275 if (brw->gen >= 7)
276 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
277 else
278 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
279 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
280 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
281 inst->regs_written = 4 * scale;
282 instructions.push_tail(inst);
283
284 if (brw->gen < 7) {
285 inst->base_mrf = 13;
286 inst->header_present = true;
287 if (brw->gen == 4)
288 inst->mlen = 3;
289 else
290 inst->mlen = 1 + dispatch_width / 8;
291 }
292
293 vec4_result.reg_offset += (const_offset & 3) * scale;
294 instructions.push_tail(MOV(dst, vec4_result));
295
296 return instructions;
297 }
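
/* Worked example (illustrative): with const_offset == 6 and scale == 1,
 * the ADD above computes vec4_offset = varying_offset + 4 (6 & ~3), the
 * pull load fills vec4_result with four contiguous components starting
 * there, and the trailing MOV reads reg_offset 2 (6 & 3), i.e. the third
 * component of the loaded vec4.
 */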
298
299 /**
300 * A helper for MOV generation for fixing up broken hardware SEND dependency
301 * handling.
302 */
303 fs_inst *
304 fs_visitor::DEP_RESOLVE_MOV(int grf)
305 {
306 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
307
308 inst->ir = NULL;
309 inst->annotation = "send dependency resolve";
310
311 /* The caller always wants uncompressed to emit the minimal extra
312 * dependencies, and to avoid having to deal with aligning its regs to 2.
313 */
314 inst->force_uncompressed = true;
315
316 return inst;
317 }
318
319 bool
320 fs_inst::equals(fs_inst *inst)
321 {
322 return (opcode == inst->opcode &&
323 dst.equals(inst->dst) &&
324 src[0].equals(inst->src[0]) &&
325 src[1].equals(inst->src[1]) &&
326 src[2].equals(inst->src[2]) &&
327 saturate == inst->saturate &&
328 predicate == inst->predicate &&
329 conditional_mod == inst->conditional_mod &&
330 mlen == inst->mlen &&
331 base_mrf == inst->base_mrf &&
332 sampler == inst->sampler &&
333 target == inst->target &&
334 eot == inst->eot &&
335 header_present == inst->header_present &&
336 shadow_compare == inst->shadow_compare &&
337 offset == inst->offset);
338 }
339
340 bool
341 fs_inst::overwrites_reg(const fs_reg &reg)
342 {
343 return (reg.file == dst.file &&
344 reg.reg == dst.reg &&
345 reg.reg_offset >= dst.reg_offset &&
346 reg.reg_offset < dst.reg_offset + regs_written);
347 }
348
349 bool
350 fs_inst::is_send_from_grf()
351 {
352 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
353 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
354 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
355 src[1].file == GRF));
356 }
357
358 bool
359 fs_visitor::can_do_source_mods(fs_inst *inst)
360 {
361 if (brw->gen == 6 && inst->is_math())
362 return false;
363
364 if (inst->is_send_from_grf())
365 return false;
366
367 return true;
368 }
369
370 void
371 fs_reg::init()
372 {
373 memset(this, 0, sizeof(*this));
374 this->smear = -1;
375 }
376
377 /** Generic unset register constructor. */
378 fs_reg::fs_reg()
379 {
380 init();
381 this->file = BAD_FILE;
382 }
383
384 /** Immediate value constructor. */
385 fs_reg::fs_reg(float f)
386 {
387 init();
388 this->file = IMM;
389 this->type = BRW_REGISTER_TYPE_F;
390 this->imm.f = f;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(int32_t i)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_D;
399 this->imm.i = i;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(uint32_t u)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_UD;
408 this->imm.u = u;
409 }
410
411 /** Fixed brw_reg Immediate value constructor. */
412 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
413 {
414 init();
415 this->file = HW_REG;
416 this->fixed_hw_reg = fixed_hw_reg;
417 this->type = fixed_hw_reg.type;
418 }
419
420 bool
421 fs_reg::equals(const fs_reg &r) const
422 {
423 return (file == r.file &&
424 reg == r.reg &&
425 reg_offset == r.reg_offset &&
426 type == r.type &&
427 negate == r.negate &&
428 abs == r.abs &&
429 !reladdr && !r.reladdr &&
430 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
431 sizeof(fixed_hw_reg)) == 0 &&
432 smear == r.smear &&
433 imm.u == r.imm.u);
434 }
435
436 bool
437 fs_reg::is_zero() const
438 {
439 if (file != IMM)
440 return false;
441
442 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
443 }
444
445 bool
446 fs_reg::is_one() const
447 {
448 if (file != IMM)
449 return false;
450
451 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
452 }
453
454 bool
455 fs_reg::is_valid_3src() const
456 {
457 return file == GRF || file == UNIFORM;
458 }
459
460 int
461 fs_visitor::type_size(const struct glsl_type *type)
462 {
463 unsigned int size, i;
464
465 switch (type->base_type) {
466 case GLSL_TYPE_UINT:
467 case GLSL_TYPE_INT:
468 case GLSL_TYPE_FLOAT:
469 case GLSL_TYPE_BOOL:
470 return type->components();
471 case GLSL_TYPE_ARRAY:
472 return type_size(type->fields.array) * type->length;
473 case GLSL_TYPE_STRUCT:
474 size = 0;
475 for (i = 0; i < type->length; i++) {
476 size += type_size(type->fields.structure[i].type);
477 }
478 return size;
479 case GLSL_TYPE_SAMPLER:
480 /* Samplers take up no register space, since they're baked in at
481 * link time.
482 */
483 return 0;
484 case GLSL_TYPE_VOID:
485 case GLSL_TYPE_ERROR:
486 case GLSL_TYPE_INTERFACE:
487 assert(!"not reached");
488 break;
489 }
490
491 return 0;
492 }
493
494 fs_reg
495 fs_visitor::get_timestamp()
496 {
497 assert(brw->gen >= 7);
498
499 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
500 BRW_ARF_TIMESTAMP,
501 0),
502 BRW_REGISTER_TYPE_UD));
503
504 fs_reg dst = fs_reg(this, glsl_type::uint_type);
505
506 fs_inst *mov = emit(MOV(dst, ts));
507 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
508 * even if it's not enabled in the dispatch.
509 */
510 mov->force_writemask_all = true;
511 mov->force_uncompressed = true;
512
513 /* The caller wants the low 32 bits of the timestamp. Since it's running
514 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
515 * which is plenty of time for our purposes. It is identical across the
516 * EUs, but since it's tracking GPU core speed it will increment at a
517 * varying rate as render P-states change.
518 *
519 * The caller could also check if render P-states have changed (or anything
520 * else that might disrupt timing) by setting smear to 2 and checking if
521 * that field is != 0.
522 */
523 dst.smear = 0;
524
525 return dst;
526 }
527
528 void
529 fs_visitor::emit_shader_time_begin()
530 {
531 current_annotation = "shader time start";
532 shader_start_time = get_timestamp();
533 }
534
535 void
536 fs_visitor::emit_shader_time_end()
537 {
538 current_annotation = "shader time end";
539
540 enum shader_time_shader_type type, written_type, reset_type;
541 if (dispatch_width == 8) {
542 type = ST_FS8;
543 written_type = ST_FS8_WRITTEN;
544 reset_type = ST_FS8_RESET;
545 } else {
546 assert(dispatch_width == 16);
547 type = ST_FS16;
548 written_type = ST_FS16_WRITTEN;
549 reset_type = ST_FS16_RESET;
550 }
551
552 fs_reg shader_end_time = get_timestamp();
553
554 /* Check that there weren't any timestamp reset events (assuming these
555 * were the only two timestamp reads that happened).
556 */
557 fs_reg reset = shader_end_time;
558 reset.smear = 2;
559 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
560 test->conditional_mod = BRW_CONDITIONAL_Z;
561 emit(IF(BRW_PREDICATE_NORMAL));
562
563 push_force_uncompressed();
564 fs_reg start = shader_start_time;
565 start.negate = true;
566 fs_reg diff = fs_reg(this, glsl_type::uint_type);
567 emit(ADD(diff, start, shader_end_time));
568
569 /* If there were no instructions between the two timestamp gets, the diff
570 * is 2 cycles. Remove that overhead, so I can forget about that when
571 * trying to determine the time taken for single instructions.
572 */
573 emit(ADD(diff, diff, fs_reg(-2u)));
574
575 emit_shader_time_write(type, diff);
576 emit_shader_time_write(written_type, fs_reg(1u));
577 emit(BRW_OPCODE_ELSE);
578 emit_shader_time_write(reset_type, fs_reg(1u));
579 emit(BRW_OPCODE_ENDIF);
580
581 pop_force_uncompressed();
582 }
583
584 void
585 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
586 fs_reg value)
587 {
588 int shader_time_index =
589 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
590 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
591
592 fs_reg payload;
593 if (dispatch_width == 8)
594 payload = fs_reg(this, glsl_type::uvec2_type);
595 else
596 payload = fs_reg(this, glsl_type::uint_type);
597
598 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
599 fs_reg(), payload, offset, value));
600 }
601
602 void
603 fs_visitor::fail(const char *format, ...)
604 {
605 va_list va;
606 char *msg;
607
608 if (failed)
609 return;
610
611 failed = true;
612
613 va_start(va, format);
614 msg = ralloc_vasprintf(mem_ctx, format, va);
615 va_end(va);
616 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
617
618 this->fail_msg = msg;
619
620 if (INTEL_DEBUG & DEBUG_WM) {
621 fprintf(stderr, "%s", msg);
622 }
623 }
624
625 fs_inst *
626 fs_visitor::emit(enum opcode opcode)
627 {
628 return emit(fs_inst(opcode));
629 }
630
631 fs_inst *
632 fs_visitor::emit(enum opcode opcode, fs_reg dst)
633 {
634 return emit(fs_inst(opcode, dst));
635 }
636
637 fs_inst *
638 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
639 {
640 return emit(fs_inst(opcode, dst, src0));
641 }
642
643 fs_inst *
644 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
645 {
646 return emit(fs_inst(opcode, dst, src0, src1));
647 }
648
649 fs_inst *
650 fs_visitor::emit(enum opcode opcode, fs_reg dst,
651 fs_reg src0, fs_reg src1, fs_reg src2)
652 {
653 return emit(fs_inst(opcode, dst, src0, src1, src2));
654 }
655
656 void
657 fs_visitor::push_force_uncompressed()
658 {
659 force_uncompressed_stack++;
660 }
661
662 void
663 fs_visitor::pop_force_uncompressed()
664 {
665 force_uncompressed_stack--;
666 assert(force_uncompressed_stack >= 0);
667 }
668
669 void
670 fs_visitor::push_force_sechalf()
671 {
672 force_sechalf_stack++;
673 }
674
675 void
676 fs_visitor::pop_force_sechalf()
677 {
678 force_sechalf_stack--;
679 assert(force_sechalf_stack >= 0);
680 }
681
682 /**
683 * Returns true if the instruction has a flag that means it won't
684 * update an entire destination register.
685 *
686 * For example, dead code elimination and live variable analysis want to know
687 * when a write to a variable screens off any preceding values that were in
688 * it.
689 */
690 bool
691 fs_inst::is_partial_write()
692 {
693 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
694 this->force_uncompressed ||
695 this->force_sechalf);
696 }
697
698 /**
699 * Returns how many MRFs an FS opcode will write over.
700 *
701 * Note that this is not the 0 or 1 implied writes in an actual gen
702 * instruction -- the FS opcodes often generate MOVs in addition.
703 */
704 int
705 fs_visitor::implied_mrf_writes(fs_inst *inst)
706 {
707 if (inst->mlen == 0)
708 return 0;
709
710 switch (inst->opcode) {
711 case SHADER_OPCODE_RCP:
712 case SHADER_OPCODE_RSQ:
713 case SHADER_OPCODE_SQRT:
714 case SHADER_OPCODE_EXP2:
715 case SHADER_OPCODE_LOG2:
716 case SHADER_OPCODE_SIN:
717 case SHADER_OPCODE_COS:
718 return 1 * dispatch_width / 8;
719 case SHADER_OPCODE_POW:
720 case SHADER_OPCODE_INT_QUOTIENT:
721 case SHADER_OPCODE_INT_REMAINDER:
722 return 2 * dispatch_width / 8;
723 case SHADER_OPCODE_TEX:
724 case FS_OPCODE_TXB:
725 case SHADER_OPCODE_TXD:
726 case SHADER_OPCODE_TXF:
727 case SHADER_OPCODE_TXF_MS:
728 case SHADER_OPCODE_TXL:
729 case SHADER_OPCODE_TXS:
730 case SHADER_OPCODE_LOD:
731 return 1;
732 case FS_OPCODE_FB_WRITE:
733 return 2;
734 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
735 case FS_OPCODE_UNSPILL:
736 return 1;
737 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
738 return inst->mlen;
739 case FS_OPCODE_SPILL:
740 return 2;
741 default:
742 assert(!"not reached");
743 return inst->mlen;
744 }
745 }
746
747 int
748 fs_visitor::virtual_grf_alloc(int size)
749 {
750 if (virtual_grf_array_size <= virtual_grf_count) {
751 if (virtual_grf_array_size == 0)
752 virtual_grf_array_size = 16;
753 else
754 virtual_grf_array_size *= 2;
755 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
756 virtual_grf_array_size);
757 }
758 virtual_grf_sizes[virtual_grf_count] = size;
759 return virtual_grf_count++;
760 }
761
762 /** Fixed HW reg constructor. */
763 fs_reg::fs_reg(enum register_file file, int reg)
764 {
765 init();
766 this->file = file;
767 this->reg = reg;
768 this->type = BRW_REGISTER_TYPE_F;
769 }
770
771 /** Fixed HW reg constructor. */
772 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
773 {
774 init();
775 this->file = file;
776 this->reg = reg;
777 this->type = type;
778 }
779
780 /** Automatic reg constructor. */
781 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
782 {
783 init();
784
785 this->file = GRF;
786 this->reg = v->virtual_grf_alloc(v->type_size(type));
787 this->reg_offset = 0;
788 this->type = brw_type_for_base_type(type);
789 }
790
791 fs_reg *
792 fs_visitor::variable_storage(ir_variable *var)
793 {
794 return (fs_reg *)hash_table_find(this->variable_ht, var);
795 }
796
797 void
798 import_uniforms_callback(const void *key,
799 void *data,
800 void *closure)
801 {
802 struct hash_table *dst_ht = (struct hash_table *)closure;
803 const fs_reg *reg = (const fs_reg *)data;
804
805 if (reg->file != UNIFORM)
806 return;
807
808 hash_table_insert(dst_ht, data, key);
809 }
810
811 /* For 16-wide, we need to start from the uniform setup of the 8-wide
812 * dispatch. This brings in those uniform definitions.
813 */
814 void
815 fs_visitor::import_uniforms(fs_visitor *v)
816 {
817 hash_table_call_foreach(v->variable_ht,
818 import_uniforms_callback,
819 variable_ht);
820 this->params_remap = v->params_remap;
821 this->nr_params_remap = v->nr_params_remap;
822 }
823
824 /* Our support for uniforms is piggy-backed on the struct
825 * gl_fragment_program, because that's where the values actually
826 * get stored, rather than in some global gl_shader_program uniform
827 * store.
828 */
829 void
830 fs_visitor::setup_uniform_values(ir_variable *ir)
831 {
832 int namelen = strlen(ir->name);
833
834 /* The data for our (non-builtin) uniforms is stored in a series of
835 * gl_uniform_driver_storage structs for each subcomponent that
836 * glGetUniformLocation() could name. We know it's been set up in the same
837 * order we'd walk the type, so walk the list of storage and find anything
838 * with our name, or the prefix of a component that starts with our name.
839 */
840 unsigned params_before = c->prog_data.nr_params;
841 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
842 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
843
844 if (strncmp(ir->name, storage->name, namelen) != 0 ||
845 (storage->name[namelen] != 0 &&
846 storage->name[namelen] != '.' &&
847 storage->name[namelen] != '[')) {
848 continue;
849 }
850
851 unsigned slots = storage->type->component_slots();
852 if (storage->array_elements)
853 slots *= storage->array_elements;
854
855 for (unsigned i = 0; i < slots; i++) {
856 c->prog_data.param[c->prog_data.nr_params++] =
857 &storage->storage[i].f;
858 }
859 }
860
861 /* Make sure we actually initialized the right amount of stuff here. */
862 assert(params_before + ir->type->component_slots() ==
863 c->prog_data.nr_params);
864 (void)params_before;
865 }
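
/* Example of the matching rule above (illustrative): for
 * "uniform vec4 color[2]", ir->name is "color", so storage entries named
 * "color" or "color[0]" match, since the character after the prefix is
 * '\0', '.' or '['. An unrelated uniform "color2" is skipped, because
 * '2' is none of those delimiters.
 */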
866
867
868 /* Our support for builtin uniforms is even scarier than non-builtin.
869 * It sits on top of the PROG_STATE_VAR parameters that are
870 * automatically updated from GL context state.
871 */
872 void
873 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
874 {
875 const ir_state_slot *const slots = ir->state_slots;
876 assert(ir->state_slots != NULL);
877
878 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
879 /* This state reference has already been setup by ir_to_mesa, but we'll
880 * get the same index back here.
881 */
882 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
883 (gl_state_index *)slots[i].tokens);
884
885 /* Add each of the unique swizzles of the element as a parameter.
886 * This'll end up matching the expected layout of the
887 * array/matrix/structure we're trying to fill in.
888 */
889 int last_swiz = -1;
890 for (unsigned int j = 0; j < 4; j++) {
891 int swiz = GET_SWZ(slots[i].swizzle, j);
892 if (swiz == last_swiz)
893 break;
894 last_swiz = swiz;
895
896 c->prog_data.param[c->prog_data.nr_params++] =
897 &fp->Base.Parameters->ParameterValues[index][swiz].f;
898 }
899 }
900 }
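
/* Example (illustrative): a state slot whose swizzle works out to
 * (x, y, y, y) adds only two params, [index][0] and [index][1], because
 * the loop stops at the first repeated swizzle component; a full
 * (x, y, z, w) slot adds all four.
 */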
901
902 fs_reg *
903 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
904 {
905 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
906 fs_reg wpos = *reg;
907 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
908
909 /* gl_FragCoord.x */
910 if (ir->pixel_center_integer) {
911 emit(MOV(wpos, this->pixel_x));
912 } else {
913 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
914 }
915 wpos.reg_offset++;
916
917 /* gl_FragCoord.y */
918 if (!flip && ir->pixel_center_integer) {
919 emit(MOV(wpos, this->pixel_y));
920 } else {
921 fs_reg pixel_y = this->pixel_y;
922 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
923
924 if (flip) {
925 pixel_y.negate = true;
926 offset += c->key.drawable_height - 1.0;
927 }
928
929 emit(ADD(wpos, pixel_y, fs_reg(offset)));
930 }
931 wpos.reg_offset++;
932
933 /* gl_FragCoord.z */
934 if (brw->gen >= 6) {
935 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
936 } else {
937 emit(FS_OPCODE_LINTERP, wpos,
938 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
939 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
940 interp_reg(VARYING_SLOT_POS, 2));
941 }
942 wpos.reg_offset++;
943
944 /* gl_FragCoord.w: Already set up in emit_interpolation */
945 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
946
947 return reg;
948 }
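
/* Worked example (illustrative): rendering to a 100-pixel-tall window
 * system drawable with the default lower-left origin and half-integer
 * pixel centers, flip is true, so the code above emits
 * wpos.y = -pixel_y + (100 - 1.0 + 0.5); a hardware Y of 0 at the top
 * row becomes gl_FragCoord.y == 99.5 at the bottom.
 */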
949
950 fs_inst *
951 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
952 glsl_interp_qualifier interpolation_mode,
953 bool is_centroid)
954 {
955 brw_wm_barycentric_interp_mode barycoord_mode;
956 if (brw->gen >= 6) {
957 if (is_centroid) {
958 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
959 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
960 else
961 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
962 } else {
963 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
964 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
965 else
966 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
967 }
968 } else {
969 /* On Ironlake and below, there is only one interpolation mode.
970 * Centroid interpolation doesn't mean anything on this hardware --
971 * there is no multisampling.
972 */
973 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
974 }
975 return emit(FS_OPCODE_LINTERP, attr,
976 this->delta_x[barycoord_mode],
977 this->delta_y[barycoord_mode], interp);
978 }
979
980 fs_reg *
981 fs_visitor::emit_general_interpolation(ir_variable *ir)
982 {
983 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
984 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
985 fs_reg attr = *reg;
986
987 unsigned int array_elements;
988 const glsl_type *type;
989
990 if (ir->type->is_array()) {
991 array_elements = ir->type->length;
992 if (array_elements == 0) {
993 fail("dereferenced array '%s' has length 0\n", ir->name);
994 }
995 type = ir->type->fields.array;
996 } else {
997 array_elements = 1;
998 type = ir->type;
999 }
1000
1001 glsl_interp_qualifier interpolation_mode =
1002 ir->determine_interpolation_mode(c->key.flat_shade);
1003
1004 int location = ir->location;
1005 for (unsigned int i = 0; i < array_elements; i++) {
1006 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1007 if (c->prog_data.urb_setup[location] == -1) {
1008 /* If there's no incoming setup data for this slot, don't
1009 * emit interpolation for it.
1010 */
1011 attr.reg_offset += type->vector_elements;
1012 location++;
1013 continue;
1014 }
1015
1016 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1017 /* Constant interpolation (flat shading) case. The SF has
1018 * handed us defined values in only the constant offset
1019 * field of the setup reg.
1020 */
1021 for (unsigned int k = 0; k < type->vector_elements; k++) {
1022 struct brw_reg interp = interp_reg(location, k);
1023 interp = suboffset(interp, 3);
1024 interp.type = reg->type;
1025 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1026 attr.reg_offset++;
1027 }
1028 } else {
1029 /* Smooth/noperspective interpolation case. */
1030 for (unsigned int k = 0; k < type->vector_elements; k++) {
1031 /* FINISHME: At some point we probably want to push
1032 * this farther by giving similar treatment to the
1033 * other potentially constant components of the
1034 * attribute, as well as making brw_vs_constval.c
1035 * handle varyings other than gl_TexCoord.
1036 */
1037 struct brw_reg interp = interp_reg(location, k);
1038 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1039 ir->centroid);
1040 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1041 /* Get the pixel/sample mask into f0 so that we know
1042 * which pixels are lit. Then, for each channel that is
1043 * unlit, replace the centroid data with non-centroid
1044 * data.
1045 */
1046 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1047 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1048 interpolation_mode, false);
1049 inst->predicate = BRW_PREDICATE_NORMAL;
1050 inst->predicate_inverse = true;
1051 }
1052 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1053 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1054 }
1055 attr.reg_offset++;
1056 }
1057
1058 }
1059 location++;
1060 }
1061 }
1062
1063 return reg;
1064 }
1065
1066 fs_reg *
1067 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1068 {
1069 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1070
1071 /* The frontfacing comes in as a bit in the thread payload. */
1072 if (brw->gen >= 6) {
1073 emit(BRW_OPCODE_ASR, *reg,
1074 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1075 fs_reg(15));
1076 emit(BRW_OPCODE_NOT, *reg, *reg);
1077 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1078 } else {
1079 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1080 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1081 * us front face
1082 */
1083 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1084 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1085 }
1086
1087 return reg;
1088 }
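
/* Worked example of the gen6+ path (illustrative): bit 15 of g0.0:D
 * holds "primitive is back-facing". ASR by 15 moves that bit down into
 * bit 0, NOT inverts it, and AND with 1 masks off the rest, leaving
 * *reg == 1 for front-facing and 0 for back-facing primitives.
 */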
1089
1090 fs_reg
1091 fs_visitor::fix_math_operand(fs_reg src)
1092 {
1093 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1094 * might be able to do better by doing execsize = 1 math and then
1095 * expanding that result out, but we would need to be careful with
1096 * masking.
1097 *
1098 * The hardware ignores source modifiers (negate and abs) on math
1099 * instructions, so we also move to a temp to set those up.
1100 */
1101 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1102 !src.abs && !src.negate)
1103 return src;
1104
1105 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1106 * operands to math
1107 */
1108 if (brw->gen >= 7 && src.file != IMM)
1109 return src;
1110
1111 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1112 expanded.type = src.type;
1113 emit(BRW_OPCODE_MOV, expanded, src);
1114 return expanded;
1115 }
1116
1117 fs_inst *
1118 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1119 {
1120 switch (opcode) {
1121 case SHADER_OPCODE_RCP:
1122 case SHADER_OPCODE_RSQ:
1123 case SHADER_OPCODE_SQRT:
1124 case SHADER_OPCODE_EXP2:
1125 case SHADER_OPCODE_LOG2:
1126 case SHADER_OPCODE_SIN:
1127 case SHADER_OPCODE_COS:
1128 break;
1129 default:
1130 assert(!"not reached: bad math opcode");
1131 return NULL;
1132 }
1133
1134 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1135 * might be able to do better by doing execsize = 1 math and then
1136 * expanding that result out, but we would need to be careful with
1137 * masking.
1138 *
1139 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1140 * instructions, so we also move to a temp to set those up.
1141 */
1142 if (brw->gen >= 6)
1143 src = fix_math_operand(src);
1144
1145 fs_inst *inst = emit(opcode, dst, src);
1146
1147 if (brw->gen < 6) {
1148 inst->base_mrf = 2;
1149 inst->mlen = dispatch_width / 8;
1150 }
1151
1152 return inst;
1153 }
1154
1155 fs_inst *
1156 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1157 {
1158 int base_mrf = 2;
1159 fs_inst *inst;
1160
1161 switch (opcode) {
1162 case SHADER_OPCODE_INT_QUOTIENT:
1163 case SHADER_OPCODE_INT_REMAINDER:
1164 if (brw->gen >= 7 && dispatch_width == 16)
1165 fail("16-wide INTDIV unsupported\n");
1166 break;
1167 case SHADER_OPCODE_POW:
1168 break;
1169 default:
1170 assert(!"not reached: unsupported binary math opcode.");
1171 return NULL;
1172 }
1173
1174 if (brw->gen >= 6) {
1175 src0 = fix_math_operand(src0);
1176 src1 = fix_math_operand(src1);
1177
1178 inst = emit(opcode, dst, src0, src1);
1179 } else {
1180 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1181 * "Message Payload":
1182 *
1183 * "Operand0[7]. For the INT DIV functions, this operand is the
1184 * denominator."
1185 * ...
1186 * "Operand1[7]. For the INT DIV functions, this operand is the
1187 * numerator."
1188 */
1189 bool is_int_div = opcode != SHADER_OPCODE_POW;
1190 fs_reg &op0 = is_int_div ? src1 : src0;
1191 fs_reg &op1 = is_int_div ? src0 : src1;
1192
1193 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1194 inst = emit(opcode, dst, op0, reg_null_f);
1195
1196 inst->base_mrf = base_mrf;
1197 inst->mlen = 2 * dispatch_width / 8;
1198 }
1199 return inst;
1200 }
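
/* Illustrative consequence of the operand swap above (assuming the
 * default base_mrf of 2): on gen4/5,
 * emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, num, denom) MOVs the
 * numerator into m3 as Operand1 and issues the send with the
 * denominator as Operand0 in m2, matching the PRM's assignment for the
 * INT DIV functions.
 */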
1201
1202 void
1203 fs_visitor::assign_curb_setup()
1204 {
1205 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1206 if (dispatch_width == 8) {
1207 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1208 } else {
1209 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1210 }
1211
1212 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1213 foreach_list(node, &this->instructions) {
1214 fs_inst *inst = (fs_inst *)node;
1215
1216 for (unsigned int i = 0; i < 3; i++) {
1217 if (inst->src[i].file == UNIFORM) {
1218 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1219 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1220 constant_nr / 8,
1221 constant_nr % 8);
1222
1223 inst->src[i].file = HW_REG;
1224 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1225 }
1226 }
1227 }
1228 }
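
/* Worked example (illustrative): with c->nr_payload_regs == 2, a
 * UNIFORM source with constant_nr == 10 becomes the fixed HW register
 * brw_vec1_grf(2 + 10 / 8, 10 % 8), i.e. the third float of g3, since
 * each CURBE register holds eight packed floats.
 */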
1229
1230 void
1231 fs_visitor::calculate_urb_setup()
1232 {
1233 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1234 c->prog_data.urb_setup[i] = -1;
1235 }
1236
1237 int urb_next = 0;
1238 /* Figure out where each of the incoming setup attributes lands. */
1239 if (brw->gen >= 6) {
1240 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1241 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1242 BITFIELD64_BIT(i)) {
1243 c->prog_data.urb_setup[i] = urb_next++;
1244 }
1245 }
1246 } else {
1247 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1248 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1249 /* Point size is packed into the header, not as a general attribute */
1250 if (i == VARYING_SLOT_PSIZ)
1251 continue;
1252
1253 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1254 /* The back color slot is skipped when the front color is
1255 * also written to. In addition, some slots can be
1256 * written in the vertex shader and not read in the
1257 * fragment shader. So the register number must always be
1258 * incremented, mapped or not.
1259 */
1260 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1261 c->prog_data.urb_setup[i] = urb_next;
1262 urb_next++;
1263 }
1264 }
1265
1266 /*
1267 * It's an FS-only attribute, and we did the interpolation for this
1268 * attribute in the SF thread. So count it here, too.
1269 *
1270 * See compile_sf_prog() for more info.
1271 */
1272 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1273 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1274 }
1275
1276 c->prog_data.num_varying_inputs = urb_next;
1277 }
1278
1279 void
1280 fs_visitor::assign_urb_setup()
1281 {
1282 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1283
1284 /* Offset all the urb_setup[] indices by the actual position of the
1285 * setup regs, now that the location of the constants has been chosen.
1286 */
1287 foreach_list(node, &this->instructions) {
1288 fs_inst *inst = (fs_inst *)node;
1289
1290 if (inst->opcode == FS_OPCODE_LINTERP) {
1291 assert(inst->src[2].file == HW_REG);
1292 inst->src[2].fixed_hw_reg.nr += urb_start;
1293 }
1294
1295 if (inst->opcode == FS_OPCODE_CINTERP) {
1296 assert(inst->src[0].file == HW_REG);
1297 inst->src[0].fixed_hw_reg.nr += urb_start;
1298 }
1299 }
1300
1301 /* Each attribute is 4 setup channels, each of which is half a reg. */
1302 this->first_non_payload_grf =
1303 urb_start + c->prog_data.num_varying_inputs * 2;
1304 }
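
/* Worked example (illustrative): with 2 payload regs, a
 * curb_read_length of 2 and 4 varying inputs, urb_start is 4 and
 * first_non_payload_grf is 4 + 4 * 2 == 12, since each attribute
 * occupies four setup channels at half a register each.
 */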
1305
1306 /**
1307 * Split large virtual GRFs into separate components if we can.
1308 *
1309 * This is mostly duplicated with what brw_fs_vector_splitting does,
1310 * but that's really conservative because it's afraid of doing
1311 * splitting that doesn't result in real progress after the rest of
1312 * the optimization phases, which would cause infinite looping in
1313 * optimization. We can do it once here, safely. This also has the
1314 * opportunity to split interpolated values, or maybe even uniforms,
1315 * which we don't have at the IR level.
1316 *
1317 * We want to split, because virtual GRFs are what we register
1318 * allocate and spill (due to contiguousness requirements for some
1319 * instructions), and they're what we naturally generate in the
1320 * codegen process, but most virtual GRFs don't actually need to be
1321 * contiguous sets of GRFs. If we split, we'll end up with reduced
1322 * live intervals and better dead code elimination and coalescing.
1323 */
1324 void
1325 fs_visitor::split_virtual_grfs()
1326 {
1327 int num_vars = this->virtual_grf_count;
1328 bool split_grf[num_vars];
1329 int new_virtual_grf[num_vars];
1330
1331 /* Try to split anything > 0 sized. */
1332 for (int i = 0; i < num_vars; i++) {
1333 if (this->virtual_grf_sizes[i] != 1)
1334 split_grf[i] = true;
1335 else
1336 split_grf[i] = false;
1337 }
1338
1339 if (brw->has_pln &&
1340 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1341 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1342 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1343 * Gen6, that was the only supported interpolation mode, and since Gen6,
1344 * delta_x and delta_y are in fixed hardware registers.
1345 */
1346 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1347 false;
1348 }
1349
1350 foreach_list(node, &this->instructions) {
1351 fs_inst *inst = (fs_inst *)node;
1352
1353 /* If there's a SEND message that requires contiguous destination
1354 * registers, no splitting is allowed.
1355 */
1356 if (inst->regs_written > 1) {
1357 split_grf[inst->dst.reg] = false;
1358 }
1359
1360 /* If we're sending from a GRF, don't split it, on the assumption that
1361 * the send is reading the whole thing.
1362 */
1363 if (inst->is_send_from_grf()) {
1364 for (int i = 0; i < 3; i++) {
1365 if (inst->src[i].file == GRF) {
1366 split_grf[inst->src[i].reg] = false;
1367 }
1368 }
1369 }
1370 }
1371
1372 /* Allocate new space for split regs. Note that the virtual
1373 * numbers will be contiguous.
1374 */
1375 for (int i = 0; i < num_vars; i++) {
1376 if (split_grf[i]) {
1377 new_virtual_grf[i] = virtual_grf_alloc(1);
1378 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1379 int reg = virtual_grf_alloc(1);
1380 assert(reg == new_virtual_grf[i] + j - 1);
1381 (void) reg;
1382 }
1383 this->virtual_grf_sizes[i] = 1;
1384 }
1385 }
1386
1387 foreach_list(node, &this->instructions) {
1388 fs_inst *inst = (fs_inst *)node;
1389
1390 if (inst->dst.file == GRF &&
1391 split_grf[inst->dst.reg] &&
1392 inst->dst.reg_offset != 0) {
1393 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1394 inst->dst.reg_offset - 1);
1395 inst->dst.reg_offset = 0;
1396 }
1397 for (int i = 0; i < 3; i++) {
1398 if (inst->src[i].file == GRF &&
1399 split_grf[inst->src[i].reg] &&
1400 inst->src[i].reg_offset != 0) {
1401 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1402 inst->src[i].reg_offset - 1);
1403 inst->src[i].reg_offset = 0;
1404 }
1405 }
1406 }
1407 this->live_intervals_valid = false;
1408 }
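
/* Sketch of the effect (illustrative): a size-4 vgrf v5 that qualifies
 * for splitting becomes four size-1 vgrfs. An access at
 * (GRF, reg = 5, reg_offset = 2) is rewritten to
 * (GRF, reg = new_virtual_grf[5] + 1, reg_offset = 0), while accesses
 * at reg_offset == 0 keep using v5, whose size is now 1.
 */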
1409
1410 /**
1411 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1412 *
1413 * During code generation, we create tons of temporary variables, many of
1414 * which get immediately killed and are never used again. Yet, in later
1415 * optimization and analysis passes, such as compute_live_intervals, we need
1416 * to loop over all the virtual GRFs. Compacting them can save a lot of
1417 * overhead.
1418 */
1419 void
1420 fs_visitor::compact_virtual_grfs()
1421 {
1422 /* Mark which virtual GRFs are used, and count how many. */
1423 int remap_table[this->virtual_grf_count];
1424 memset(remap_table, -1, sizeof(remap_table));
1425
1426 foreach_list(node, &this->instructions) {
1427 const fs_inst *inst = (const fs_inst *) node;
1428
1429 if (inst->dst.file == GRF)
1430 remap_table[inst->dst.reg] = 0;
1431
1432 for (int i = 0; i < 3; i++) {
1433 if (inst->src[i].file == GRF)
1434 remap_table[inst->src[i].reg] = 0;
1435 }
1436 }
1437
1438 /* In addition to registers used in instructions, fs_visitor keeps
1439 * direct references to certain special values which must be patched:
1440 */
1441 fs_reg *special[] = {
1442 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1443 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1444 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1445 &delta_x[0], &delta_x[1], &delta_x[2],
1446 &delta_x[3], &delta_x[4], &delta_x[5],
1447 &delta_y[0], &delta_y[1], &delta_y[2],
1448 &delta_y[3], &delta_y[4], &delta_y[5],
1449 };
1450 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1451 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1452
1453 /* Treat all special values as used, to be conservative */
1454 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1455 if (special[i]->file == GRF)
1456 remap_table[special[i]->reg] = 0;
1457 }
1458
1459 /* Compact the GRF arrays. */
1460 int new_index = 0;
1461 for (int i = 0; i < this->virtual_grf_count; i++) {
1462 if (remap_table[i] != -1) {
1463 remap_table[i] = new_index;
1464 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1465 if (live_intervals_valid) {
1466 virtual_grf_start[new_index] = virtual_grf_start[i];
1467 virtual_grf_end[new_index] = virtual_grf_end[i];
1468 }
1469 ++new_index;
1470 }
1471 }
1472
1473 this->virtual_grf_count = new_index;
1474
1475 /* Patch all the instructions to use the newly renumbered registers */
1476 foreach_list(node, &this->instructions) {
1477 fs_inst *inst = (fs_inst *) node;
1478
1479 if (inst->dst.file == GRF)
1480 inst->dst.reg = remap_table[inst->dst.reg];
1481
1482 for (int i = 0; i < 3; i++) {
1483 if (inst->src[i].file == GRF)
1484 inst->src[i].reg = remap_table[inst->src[i].reg];
1485 }
1486 }
1487
1488 /* Patch all the references to special values */
1489 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1490 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1491 special[i]->reg = remap_table[special[i]->reg];
1492 }
1493 }
1494
1495 bool
1496 fs_visitor::remove_dead_constants()
1497 {
1498 if (dispatch_width == 8) {
1499 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1500 this->nr_params_remap = c->prog_data.nr_params;
1501
1502 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1503 this->params_remap[i] = -1;
1504
1505 /* Find which params are still in use. */
1506 foreach_list(node, &this->instructions) {
1507 fs_inst *inst = (fs_inst *)node;
1508
1509 for (int i = 0; i < 3; i++) {
1510 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1511
1512 if (inst->src[i].file != UNIFORM)
1513 continue;
1514
1515 /* Section 5.11 of the OpenGL 4.3 spec says:
1516 *
1517 * "Out-of-bounds reads return undefined values, which include
1518 * values from other variables of the active program or zero."
1519 */
1520 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1521 constant_nr = 0;
1522 }
1523
1524 /* For now, set this to non-negative. We'll give it the
1525 * actual new number in a moment, in order to keep the
1526 * register numbers nicely ordered.
1527 */
1528 this->params_remap[constant_nr] = 0;
1529 }
1530 }
1531
1532 /* Figure out what the new numbers for the params will be. At some
1533 * point when we're doing uniform array access, we're going to want
1534 * to keep the distinction between .reg and .reg_offset, but for
1535 * now we don't care.
1536 */
1537 unsigned int new_nr_params = 0;
1538 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1539 if (this->params_remap[i] != -1) {
1540 this->params_remap[i] = new_nr_params++;
1541 }
1542 }
1543
1544 /* Update the list of params to be uploaded to match our new numbering. */
1545 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1546 int remapped = this->params_remap[i];
1547
1548 if (remapped == -1)
1549 continue;
1550
1551 c->prog_data.param[remapped] = c->prog_data.param[i];
1552 }
1553
1554 c->prog_data.nr_params = new_nr_params;
1555 } else {
1556 /* This should have been generated in the 8-wide pass already. */
1557 assert(this->params_remap);
1558 }
1559
1560 /* Now do the renumbering of the shader to remove unused params. */
1561 foreach_list(node, &this->instructions) {
1562 fs_inst *inst = (fs_inst *)node;
1563
1564 for (int i = 0; i < 3; i++) {
1565 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1566
1567 if (inst->src[i].file != UNIFORM)
1568 continue;
1569
1570 /* As above, alias out-of-bounds accesses to param 0. */
1571 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1572 constant_nr = 0;
1573 }
1574 assert(this->params_remap[constant_nr] != -1);
1575 inst->src[i].reg = this->params_remap[constant_nr];
1576 inst->src[i].reg_offset = 0;
1577 }
1578 }
1579
1580 return true;
1581 }
1582
1583 /*
1584 * Implements array access of uniforms by inserting a
1585 * PULL_CONSTANT_LOAD instruction.
1586 *
1587 * Unlike temporary GRF array access (where we don't support it due to
1588 * the difficulty of doing relative addressing on instruction
1589 * destinations), we could potentially do array access of uniforms
1590 * that were loaded in GRF space as push constants. In real-world
1591 * usage we've seen, though, the arrays being used are always larger
1592 * than we could load as push constants, so just always move all
1593 * uniform array access out to a pull constant buffer.
1594 */
1595 void
1596 fs_visitor::move_uniform_array_access_to_pull_constants()
1597 {
1598 int pull_constant_loc[c->prog_data.nr_params];
1599
1600 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1601 pull_constant_loc[i] = -1;
1602 }
1603
1604 /* Walk through and find array access of uniforms. Put a copy of that
1605 * uniform in the pull constant buffer.
1606 *
1607 * Note that we don't move constant-indexed accesses to arrays. No
1608 * testing has been done of the performance impact of this choice.
1609 */
1610 foreach_list_safe(node, &this->instructions) {
1611 fs_inst *inst = (fs_inst *)node;
1612
1613 for (int i = 0 ; i < 3; i++) {
1614 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1615 continue;
1616
1617 int uniform = inst->src[i].reg;
1618
1619 /* If this array isn't already present in the pull constant buffer,
1620 * add it.
1621 */
1622 if (pull_constant_loc[uniform] == -1) {
1623 const float **values = &c->prog_data.param[uniform];
1624
1625 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1626
1627 assert(param_size[uniform]);
1628
1629 for (int j = 0; j < param_size[uniform]; j++) {
1630 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1631 values[j];
1632 }
1633 }
1634
1635 /* Set up the annotation tracking for new generated instructions. */
1636 base_ir = inst->ir;
1637 current_annotation = inst->annotation;
1638
1639 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1640 fs_reg temp = fs_reg(this, glsl_type::float_type);
1641 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1642 surf_index,
1643 *inst->src[i].reladdr,
1644 pull_constant_loc[uniform] +
1645 inst->src[i].reg_offset);
1646 inst->insert_before(&list);
1647
1648 inst->src[i].file = temp.file;
1649 inst->src[i].reg = temp.reg;
1650 inst->src[i].reg_offset = temp.reg_offset;
1651 inst->src[i].reladdr = NULL;
1652 }
1653 }
1654 }
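
/* Sketch of the rewrite (illustrative): for "uniform vec4 a[20]" read
 * as a[i], a source (UNIFORM, reg = base_of_a, reladdr = &i_reg) gets
 * all 80 floats of "a" appended to pull_param the first time it is
 * seen, a VARYING_PULL_CONSTANT_LOAD into a temp inserted before the
 * instruction, and the source repointed at the temp with reladdr
 * cleared.
 */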
1655
1656 /**
1657 * Choose accesses from the UNIFORM file to demote to using the pull
1658 * constant buffer.
1659 *
1660 * We allow a fragment shader to have more than the specified minimum
1661 * maximum number of fragment shader uniform components (64). If
1662 * there are too many of these, they'd fill up all of the register space.
1663 * So, this will push some of them out to the pull constant buffer and
1664 * update the program to load them.
1665 */
1666 void
1667 fs_visitor::setup_pull_constants()
1668 {
1669 /* Only allow 16 registers (128 uniform components) as push constants. */
1670 unsigned int max_uniform_components = 16 * 8;
1671 if (c->prog_data.nr_params <= max_uniform_components)
1672 return;
1673
1674 if (dispatch_width == 16) {
1675 fail("Pull constants not supported in 16-wide\n");
1676 return;
1677 }
1678
1679 /* Just demote the end of the list. We could probably do better
1680 * here, demoting things that are rarely used in the program first.
1681 */
1682 unsigned int pull_uniform_base = max_uniform_components;
1683
1684 int pull_constant_loc[c->prog_data.nr_params];
1685 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1686 if (i < pull_uniform_base) {
1687 pull_constant_loc[i] = -1;
1688 } else {
1689 pull_constant_loc[i] = -1;
1690 /* If our constant is already being uploaded for reladdr purposes,
1691 * reuse it.
1692 */
1693 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1694 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1695 pull_constant_loc[i] = j;
1696 break;
1697 }
1698 }
1699 if (pull_constant_loc[i] == -1) {
1700 int pull_index = c->prog_data.nr_pull_params++;
1701 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1702 pull_constant_loc[i] = pull_index;
1703 }
1704 }
1705 }
1706 c->prog_data.nr_params = pull_uniform_base;
1707
1708 foreach_list(node, &this->instructions) {
1709 fs_inst *inst = (fs_inst *)node;
1710
1711 for (int i = 0; i < 3; i++) {
1712 if (inst->src[i].file != UNIFORM)
1713 continue;
1714
1715 int pull_index = pull_constant_loc[inst->src[i].reg +
1716 inst->src[i].reg_offset];
1717 if (pull_index == -1)
1718 continue;
1719
1720 assert(!inst->src[i].reladdr);
1721
1722 fs_reg dst = fs_reg(this, glsl_type::float_type);
1723 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1724 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1725 fs_inst *pull =
1726 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1727 dst, index, offset);
1728 pull->ir = inst->ir;
1729 pull->annotation = inst->annotation;
1730
1731 inst->insert_before(pull);
1732
1733 inst->src[i].file = GRF;
1734 inst->src[i].reg = dst.reg;
1735 inst->src[i].reg_offset = 0;
1736 inst->src[i].smear = pull_index & 3;
1737 }
1738 }
1739 }
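
/* Worked example (illustrative): a demoted uniform with pull_index == 5
 * loads from byte offset (5 * 4) & ~15 == 16, the second 16-byte vec4
 * of the constant buffer, and the consuming source then reads component
 * 5 & 3 == 1 of the loaded register via smear.
 */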
1740
1741 bool
1742 fs_visitor::opt_algebraic()
1743 {
1744 bool progress = false;
1745
1746 foreach_list(node, &this->instructions) {
1747 fs_inst *inst = (fs_inst *)node;
1748
1749 switch (inst->opcode) {
1750 case BRW_OPCODE_MUL:
1751 if (inst->src[1].file != IMM)
1752 continue;
1753
1754 /* a * 1.0 = a */
1755 if (inst->src[1].is_one()) {
1756 inst->opcode = BRW_OPCODE_MOV;
1757 inst->src[1] = reg_undef;
1758 progress = true;
1759 break;
1760 }
1761
1762 /* a * 0.0 = 0.0 */
1763 if (inst->src[1].is_zero()) {
1764 inst->opcode = BRW_OPCODE_MOV;
1765 inst->src[0] = inst->src[1];
1766 inst->src[1] = reg_undef;
1767 progress = true;
1768 break;
1769 }
1770
1771 break;
1772 case BRW_OPCODE_ADD:
1773 if (inst->src[1].file != IMM)
1774 continue;
1775
1776 /* a + 0.0 = a */
1777 if (inst->src[1].is_zero()) {
1778 inst->opcode = BRW_OPCODE_MOV;
1779 inst->src[1] = reg_undef;
1780 progress = true;
1781 break;
1782 }
1783 break;
1784 default:
1785 break;
1786 }
1787 }
1788
1789 return progress;
1790 }
1791
1792 /**
1793 * Removes any instructions writing a VGRF where that VGRF is not used by any
1794 * later instruction.
1795 */
1796 bool
1797 fs_visitor::dead_code_eliminate()
1798 {
1799 bool progress = false;
1800 int pc = 0;
1801
1802 calculate_live_intervals();
1803
1804 foreach_list_safe(node, &this->instructions) {
1805 fs_inst *inst = (fs_inst *)node;
1806
1807 if (inst->dst.file == GRF) {
1808 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1809 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1810 inst->remove();
1811 progress = true;
1812 }
1813 }
1814
1815 pc++;
1816 }
1817
1818 if (progress)
1819 live_intervals_valid = false;
1820
1821 return progress;
1822 }
1823
1824 struct dead_code_hash_key
1825 {
1826 int vgrf;
1827 int reg_offset;
1828 };
1829
1830 static bool
1831 dead_code_hash_compare(const void *a, const void *b)
1832 {
1833 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1834 }
1835
1836 static void
1837 clear_dead_code_hash(struct hash_table *ht)
1838 {
1839 struct hash_entry *entry;
1840
1841 hash_table_foreach(ht, entry) {
1842 _mesa_hash_table_remove(ht, entry);
1843 }
1844 }
1845
1846 static void
1847 insert_dead_code_hash(struct hash_table *ht,
1848 int vgrf, int reg_offset, fs_inst *inst)
1849 {
1850 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1851 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1852
1853 key->vgrf = vgrf;
1854 key->reg_offset = reg_offset;
1855
1856 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1857 }
1858
1859 static struct hash_entry *
1860 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1861 {
1862 struct dead_code_hash_key key;
1863
1864 key.vgrf = vgrf;
1865 key.reg_offset = reg_offset;
1866
1867 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1868 }
1869
1870 static void
1871 remove_dead_code_hash(struct hash_table *ht,
1872 int vgrf, int reg_offset)
1873 {
1874 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1875 if (!entry)
1876 return;
1877
1878 _mesa_hash_table_remove(ht, entry);
1879 }
1880
1881 /**
1882 * Walks basic blocks, removing any regs that are written but not read before
1883 * being redefined.
1884 *
1885 * The dead_code_eliminate() function implements a global dead code
1886 * elimination, but it only handles the removing the last write to a register
1887 * if it's never read. This one can handle intermediate writes, but only
1888 * within a basic block.
1889 */
1890 bool
1891 fs_visitor::dead_code_eliminate_local()
1892 {
1893 struct hash_table *ht;
1894 bool progress = false;
1895
1896 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1897
1898 foreach_list_safe(node, &this->instructions) {
1899 fs_inst *inst = (fs_inst *)node;
1900
1901 /* At a basic block, empty the HT since we don't understand dataflow
1902 * here.
1903 */
1904 if (inst->is_control_flow()) {
1905 clear_dead_code_hash(ht);
1906 continue;
1907 }
1908
1909 /* Clear the HT of any instructions that got read. */
1910 for (int i = 0; i < 3; i++) {
1911 fs_reg src = inst->src[i];
1912 if (src.file != GRF)
1913 continue;
1914
1915 int read = 1;
1916 if (inst->is_send_from_grf())
1917 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1918
1919 for (int reg_offset = src.reg_offset;
1920 reg_offset < src.reg_offset + read;
1921 reg_offset++) {
1922 remove_dead_code_hash(ht, src.reg, reg_offset);
1923 }
1924 }
1925
1926 /* Add any update of a GRF to the HT, removing a previous write if it
1927 * wasn't read.
1928 */
1929 if (inst->dst.file == GRF) {
1930 if (inst->regs_written > 1) {
1931 /* We don't know how to trim channels from an instruction's
1932 * writes, so we can't incrementally remove unread channels from
1933 * it. Just remove whatever it overwrites from the table
1934 */
1935 for (int i = 0; i < inst->regs_written; i++) {
1936 remove_dead_code_hash(ht,
1937 inst->dst.reg,
1938 inst->dst.reg_offset + i);
1939 }
1940 } else {
1941 struct hash_entry *entry =
1942 get_dead_code_hash_entry(ht, inst->dst.reg,
1943 inst->dst.reg_offset);
1944
1945 if (inst->is_partial_write()) {
1946 /* For a partial write, we can't remove any previous dead code
1947 * candidate, since we're just modifying their result, but we can
1948 * be dead code eliminiated ourselves.
1949 */
1950 if (entry) {
1951 entry->data = inst;
1952 } else {
1953 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1954 inst);
1955 }
1956 } else {
1957 if (entry) {
1958 /* We're completely updating a channel, and there was a
1959 * previous write to the channel that wasn't read. Kill it!
1960 */
1961                fs_inst *dead = (fs_inst *)entry->data;
1962                dead->remove();
1963 progress = true;
1964 _mesa_hash_table_remove(ht, entry);
1965 }
1966
1967 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1968 inst);
1969 }
1970 }
1971 }
1972 }
1973
1974 _mesa_hash_table_destroy(ht, NULL);
1975
1976 if (progress)
1977 live_intervals_valid = false;
1978
1979 return progress;
1980 }
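
/* Illustrative IR (not actual dump output): within one basic block,
 *
 *    mov vgrf0, vgrf1    <- removed: fully overwritten before being read
 *    mov vgrf0, vgrf2
 *    mov vgrf3, vgrf0
 *
 * the first MOV's result is never read before the second MOV redefines
 * the channel, so this pass deletes it.
 */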
1981
1982 /**
1983 * Implements a second type of register coalescing: This one checks if
1984 * the two regs involved in a raw move don't interfere, in which case
1985  * they can both be stored in the same place and the MOV removed.
1986 */
1987 bool
1988 fs_visitor::register_coalesce_2()
1989 {
1990 bool progress = false;
1991
1992 calculate_live_intervals();
1993
1994 foreach_list_safe(node, &this->instructions) {
1995 fs_inst *inst = (fs_inst *)node;
1996
1997 if (inst->opcode != BRW_OPCODE_MOV ||
1998 inst->is_partial_write() ||
1999 inst->saturate ||
2000 inst->src[0].file != GRF ||
2001 inst->src[0].negate ||
2002 inst->src[0].abs ||
2003 inst->src[0].smear != -1 ||
2004 inst->dst.file != GRF ||
2005 inst->dst.type != inst->src[0].type ||
2006 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2007 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2008 continue;
2009 }
2010
2011 int reg_from = inst->src[0].reg;
2012 assert(inst->src[0].reg_offset == 0);
2013 int reg_to = inst->dst.reg;
2014 int reg_to_offset = inst->dst.reg_offset;
2015
2016 foreach_list(node, &this->instructions) {
2017 fs_inst *scan_inst = (fs_inst *)node;
2018
2019 if (scan_inst->dst.file == GRF &&
2020 scan_inst->dst.reg == reg_from) {
2021 scan_inst->dst.reg = reg_to;
2022 scan_inst->dst.reg_offset = reg_to_offset;
2023 }
2024 for (int i = 0; i < 3; i++) {
2025 if (scan_inst->src[i].file == GRF &&
2026 scan_inst->src[i].reg == reg_from) {
2027 scan_inst->src[i].reg = reg_to;
2028 scan_inst->src[i].reg_offset = reg_to_offset;
2029 }
2030 }
2031 }
2032
2033 inst->remove();
2034
2035 /* We don't need to recalculate live intervals inside the loop despite
2036 * flagging live_intervals_valid because we only use live intervals for
2037 * the interferes test, and we must have had a situation where the
2038 * intervals were:
2039 *
2040 * from to
2041 * ^
2042 * |
2043 * v
2044 * ^
2045 * |
2046 * v
2047 *
2048 * Some register R that might get coalesced with one of these two could
2049 * only be referencing "to", otherwise "from"'s range would have been
2050 * longer. R's range could also only start at the end of "to" or later,
2051        * otherwise it would conflict with "to" when we try to coalesce "to"
2052        * into R anyway.
2053 */
2054 live_intervals_valid = false;
2055
2056 progress = true;
2057 continue;
2058 }
2059
2060 return progress;
2061 }
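
/* An illustrative example (made-up vgrf numbers): given that vgrf1 and
 * vgrf2 have non-interfering live ranges,
 *
 *    add vgrf1, vgrf4, vgrf5
 *    mov vgrf2, vgrf1
 *    mul vgrf3, vgrf2, vgrf6
 *
 * every def and use of vgrf1 is renamed to vgrf2 and the MOV is removed:
 *
 *    add vgrf2, vgrf4, vgrf5
 *    mul vgrf3, vgrf2, vgrf6
 */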
2062
2063 bool
2064 fs_visitor::register_coalesce()
2065 {
2066 bool progress = false;
2067 int if_depth = 0;
2068 int loop_depth = 0;
2069
2070 foreach_list_safe(node, &this->instructions) {
2071 fs_inst *inst = (fs_inst *)node;
2072
2073 /* Make sure that we dominate the instructions we're going to
2074 * scan for interfering with our coalescing, or we won't have
2075 * scanned enough to see if anything interferes with our
2076 * coalescing. We don't dominate the following instructions if
2077 * we're in a loop or an if block.
2078 */
2079 switch (inst->opcode) {
2080 case BRW_OPCODE_DO:
2081 loop_depth++;
2082 break;
2083 case BRW_OPCODE_WHILE:
2084 loop_depth--;
2085 break;
2086 case BRW_OPCODE_IF:
2087 if_depth++;
2088 break;
2089 case BRW_OPCODE_ENDIF:
2090 if_depth--;
2091 break;
2092 default:
2093 break;
2094 }
2095 if (loop_depth || if_depth)
2096 continue;
2097
2098 if (inst->opcode != BRW_OPCODE_MOV ||
2099 inst->is_partial_write() ||
2100 inst->saturate ||
2101 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2102           inst->src[0].file != UNIFORM) ||
2103 inst->dst.type != inst->src[0].type)
2104 continue;
2105
2106 bool has_source_modifiers = (inst->src[0].abs ||
2107 inst->src[0].negate ||
2108 inst->src[0].smear != -1 ||
2109 inst->src[0].file == UNIFORM);
2110
2111 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2112 * them: check for no writes to either one until the exit of the
2113 * program.
2114 */
2115 bool interfered = false;
2116
2117 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2118 !scan_inst->is_tail_sentinel();
2119 scan_inst = (fs_inst *)scan_inst->next) {
2120 if (scan_inst->dst.file == GRF) {
2121 if (scan_inst->overwrites_reg(inst->dst) ||
2122 scan_inst->overwrites_reg(inst->src[0])) {
2123 interfered = true;
2124 break;
2125 }
2126 }
2127
2128 if (has_source_modifiers) {
2129 for (int i = 0; i < 3; i++) {
2130 if (scan_inst->src[i].file == GRF &&
2131 scan_inst->src[i].reg == inst->dst.reg &&
2132 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2133 inst->dst.type != scan_inst->src[i].type)
2134 {
2135 interfered = true;
2136 break;
2137 }
2138 }
2139 }
2140
2141
2142 /* The gen6 MATH instruction can't handle source modifiers or
2143 * unusual register regions, so avoid coalescing those for
2144 * now. We should do something more specific.
2145 */
2146 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2147 interfered = true;
2148 break;
2149 }
2150
2151 /* The accumulator result appears to get used for the
2152 * conditional modifier generation. When negating a UD
2153 * value, there is a 33rd bit generated for the sign in the
2154 * accumulator value, so now you can't check, for example,
2155 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2156 */
2157 if (scan_inst->conditional_mod &&
2158 inst->src[0].negate &&
2159 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2160 interfered = true;
2161 break;
2162 }
2163 }
2164 if (interfered) {
2165 continue;
2166 }
2167
2168 /* Rewrite the later usage to point at the source of the move to
2169 * be removed.
2170 */
2171 for (fs_inst *scan_inst = inst;
2172 !scan_inst->is_tail_sentinel();
2173 scan_inst = (fs_inst *)scan_inst->next) {
2174 for (int i = 0; i < 3; i++) {
2175 if (scan_inst->src[i].file == GRF &&
2176 scan_inst->src[i].reg == inst->dst.reg &&
2177 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2178 fs_reg new_src = inst->src[0];
2179 if (scan_inst->src[i].abs) {
2180 new_src.negate = 0;
2181 new_src.abs = 1;
2182 }
2183 new_src.negate ^= scan_inst->src[i].negate;
2184 scan_inst->src[i] = new_src;
2185 }
2186 }
2187 }
2188
2189 inst->remove();
2190 progress = true;
2191 }
2192
2193 if (progress)
2194 live_intervals_valid = false;
2195
2196 return progress;
2197 }
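
/* Source-modifier propagation example for the pass above (hypothetical
 * registers): when coalescing
 *
 *    mov vgrf2, -vgrf1
 *    add vgrf3, -vgrf2, vgrf4
 *
 * the two negations are XORed while rewriting the ADD's source, yielding
 *
 *    add vgrf3, vgrf1, vgrf4
 */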
2198
2199
2200 bool
2201 fs_visitor::compute_to_mrf()
2202 {
2203 bool progress = false;
2204 int next_ip = 0;
2205
2206 calculate_live_intervals();
2207
2208 foreach_list_safe(node, &this->instructions) {
2209 fs_inst *inst = (fs_inst *)node;
2210
2211 int ip = next_ip;
2212 next_ip++;
2213
2214 if (inst->opcode != BRW_OPCODE_MOV ||
2215 inst->is_partial_write() ||
2216 inst->dst.file != MRF || inst->src[0].file != GRF ||
2217 inst->dst.type != inst->src[0].type ||
2218 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2219 continue;
2220
2221 /* Work out which hardware MRF registers are written by this
2222 * instruction.
2223 */
2224 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2225 int mrf_high;
2226 if (inst->dst.reg & BRW_MRF_COMPR4) {
2227 mrf_high = mrf_low + 4;
2228 } else if (dispatch_width == 16 &&
2229 (!inst->force_uncompressed && !inst->force_sechalf)) {
2230 mrf_high = mrf_low + 1;
2231 } else {
2232 mrf_high = mrf_low;
2233 }
2234
2235 /* Can't compute-to-MRF this GRF if someone else was going to
2236 * read it later.
2237 */
2238 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2239 continue;
2240
2241 /* Found a move of a GRF to a MRF. Let's see if we can go
2242 * rewrite the thing that made this GRF to write into the MRF.
2243 */
2244 fs_inst *scan_inst;
2245 for (scan_inst = (fs_inst *)inst->prev;
2246 scan_inst->prev != NULL;
2247 scan_inst = (fs_inst *)scan_inst->prev) {
2248 if (scan_inst->dst.file == GRF &&
2249 scan_inst->dst.reg == inst->src[0].reg) {
2250 /* Found the last thing to write our reg we want to turn
2251 * into a compute-to-MRF.
2252 */
2253
2254 /* If this one instruction didn't populate all the
2255 * channels, bail. We might be able to rewrite everything
2256 * that writes that reg, but it would require smarter
2257 * tracking to delay the rewriting until complete success.
2258 */
2259 if (scan_inst->is_partial_write())
2260 break;
2261
2262          /* Instructions writing more than one register would require
2263           * coalescing more than one MOV at a time.
2264 */
2265 if (scan_inst->regs_written > 1)
2266 break;
2267
2268 /* SEND instructions can't have MRF as a destination. */
2269 if (scan_inst->mlen)
2270 break;
2271
2272 if (brw->gen == 6) {
2273 /* gen6 math instructions must have the destination be
2274 * GRF, so no compute-to-MRF for them.
2275 */
2276 if (scan_inst->is_math()) {
2277 break;
2278 }
2279 }
2280
2281 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2282 /* Found the creator of our MRF's source value. */
2283 scan_inst->dst.file = MRF;
2284 scan_inst->dst.reg = inst->dst.reg;
2285 scan_inst->saturate |= inst->saturate;
2286 inst->remove();
2287 progress = true;
2288 }
2289 break;
2290 }
2291
2292 /* We don't handle control flow here. Most computation of
2293        * values that end up in MRFs happens shortly before the MRF
2294        * write anyway.
2295 */
2296 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2297 break;
2298
2299 /* You can't read from an MRF, so if someone else reads our
2300 * MRF's source GRF that we wanted to rewrite, that stops us.
2301 */
2302 bool interfered = false;
2303 for (int i = 0; i < 3; i++) {
2304 if (scan_inst->src[i].file == GRF &&
2305 scan_inst->src[i].reg == inst->src[0].reg &&
2306 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2307 interfered = true;
2308 }
2309 }
2310 if (interfered)
2311 break;
2312
2313 if (scan_inst->dst.file == MRF) {
2314 /* If somebody else writes our MRF here, we can't
2315 * compute-to-MRF before that.
2316 */
2317 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2318 int scan_mrf_high;
2319
2320 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2321 scan_mrf_high = scan_mrf_low + 4;
2322 } else if (dispatch_width == 16 &&
2323 (!scan_inst->force_uncompressed &&
2324 !scan_inst->force_sechalf)) {
2325 scan_mrf_high = scan_mrf_low + 1;
2326 } else {
2327 scan_mrf_high = scan_mrf_low;
2328 }
2329
2330 if (mrf_low == scan_mrf_low ||
2331 mrf_low == scan_mrf_high ||
2332 mrf_high == scan_mrf_low ||
2333 mrf_high == scan_mrf_high) {
2334 break;
2335 }
2336 }
2337
2338 if (scan_inst->mlen > 0) {
2339 /* Found a SEND instruction, which means that there are
2340 * live values in MRFs from base_mrf to base_mrf +
2341 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2342 * above it.
2343 */
2344 if (mrf_low >= scan_inst->base_mrf &&
2345 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2346 break;
2347 }
2348 if (mrf_high >= scan_inst->base_mrf &&
2349 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2350 break;
2351 }
2352 }
2353 }
2354 }
2355
2356 if (progress)
2357 live_intervals_valid = false;
2358
2359 return progress;
2360 }
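
/* A standalone sketch of the MRF-range computation used twice above
 * (assuming only the BRW_MRF_COMPR4 flag bit, as in the code itself):
 *
 *    int lo = mrf_reg & ~BRW_MRF_COMPR4;
 *    int hi;
 *    if (mrf_reg & BRW_MRF_COMPR4)
 *       hi = lo + 4;      // COMPR4 writes m and m+4
 *    else if (dispatch_width == 16 && !uncompressed && !sechalf)
 *       hi = lo + 1;      // compressed: two adjacent MRFs
 *    else
 *       hi = lo;          // a single MRF
 */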
2361
2362 /**
2363 * Walks through basic blocks, looking for repeated MRF writes and
2364 * removing the later ones.
2365 */
2366 bool
2367 fs_visitor::remove_duplicate_mrf_writes()
2368 {
2369 fs_inst *last_mrf_move[16];
2370 bool progress = false;
2371
2372    /* The MRF tracking below doesn't handle compressed instructions yet. */
2373 if (dispatch_width == 16)
2374 return false;
2375
2376 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2377
2378 foreach_list_safe(node, &this->instructions) {
2379 fs_inst *inst = (fs_inst *)node;
2380
2381 if (inst->is_control_flow()) {
2382 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2383 }
2384
2385 if (inst->opcode == BRW_OPCODE_MOV &&
2386 inst->dst.file == MRF) {
2387 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2388 if (prev_inst && inst->equals(prev_inst)) {
2389 inst->remove();
2390 progress = true;
2391 continue;
2392 }
2393 }
2394
2395 /* Clear out the last-write records for MRFs that were overwritten. */
2396 if (inst->dst.file == MRF) {
2397 last_mrf_move[inst->dst.reg] = NULL;
2398 }
2399
2400 if (inst->mlen > 0) {
2401 /* Found a SEND instruction, which will include two or fewer
2402 * implied MRF writes. We could do better here.
2403 */
2404 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2405 last_mrf_move[inst->base_mrf + i] = NULL;
2406 }
2407 }
2408
2409 /* Clear out any MRF move records whose sources got overwritten. */
2410 if (inst->dst.file == GRF) {
2411 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2412 if (last_mrf_move[i] &&
2413 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2414 last_mrf_move[i] = NULL;
2415 }
2416 }
2417 }
2418
2419 if (inst->opcode == BRW_OPCODE_MOV &&
2420 inst->dst.file == MRF &&
2421 inst->src[0].file == GRF &&
2422 !inst->is_partial_write()) {
2423 last_mrf_move[inst->dst.reg] = inst;
2424 }
2425 }
2426
2427 if (progress)
2428 live_intervals_valid = false;
2429
2430 return progress;
2431 }
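
/* Illustrative example (made-up registers): within a basic block,
 *
 *    mov m4, vgrf7
 *    mov m4, vgrf7    <- removed: exact duplicate of the live
 *                        last-write record for m4
 *
 * Control flow, an overwrite of m4, an implied MRF write from a SEND, or
 * a write to vgrf7 all invalidate the record and keep the second MOV.
 */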
2432
2433 static void
2434 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2435 int first_grf, int grf_len)
2436 {
2437 bool inst_16wide = (dispatch_width > 8 &&
2438 !inst->force_uncompressed &&
2439 !inst->force_sechalf);
2440
2441 /* Clear the flag for registers that actually got read (as expected). */
2442 for (int i = 0; i < 3; i++) {
2443 int grf;
2444 if (inst->src[i].file == GRF) {
2445 grf = inst->src[i].reg;
2446 } else if (inst->src[i].file == HW_REG &&
2447 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2448 grf = inst->src[i].fixed_hw_reg.nr;
2449 } else {
2450 continue;
2451 }
2452
2453 if (grf >= first_grf &&
2454 grf < first_grf + grf_len) {
2455 deps[grf - first_grf] = false;
2456 if (inst_16wide)
2457 deps[grf - first_grf + 1] = false;
2458 }
2459 }
2460 }
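
/* Illustrative, with made-up numbers: an instruction executing 16-wide
 * reads two registers per GRF source (e.g. g10 spans g10..g11), which is
 * why the loop above also clears deps[grf - first_grf + 1] in that case.
 */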
2461
2462 /**
2463 * Implements this workaround for the original 965:
2464 *
2465 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2466 * check for post destination dependencies on this instruction, software
2467 * must ensure that there is no destination hazard for the case of ‘write
2468 * followed by a posted write’ shown in the following example.
2469 *
2470 * 1. mov r3 0
2471 * 2. send r3.xy <rest of send instruction>
2472 * 3. mov r2 r3
2473 *
2474 * Due to no post-destination dependency check on the ‘send’, the above
2475 * code sequence could have two instructions (1 and 2) in flight at the
2476  *    same time that both consider ‘r3’ as the target of their final writes."
2477 */
2478 void
2479 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2480 {
2481 int reg_size = dispatch_width / 8;
2482 int write_len = inst->regs_written * reg_size;
2483 int first_write_grf = inst->dst.reg;
2484 bool needs_dep[BRW_MAX_MRF];
2485 assert(write_len < (int)sizeof(needs_dep) - 1);
2486
2487 memset(needs_dep, false, sizeof(needs_dep));
2488 memset(needs_dep, true, write_len);
2489
2490 clear_deps_for_inst_src(inst, dispatch_width,
2491 needs_dep, first_write_grf, write_len);
2492
2493 /* Walk backwards looking for writes to registers we're writing which
2494 * aren't read since being written. If we hit the start of the program,
2495 * we assume that there are no outstanding dependencies on entry to the
2496 * program.
2497 */
2498 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2499 scan_inst != NULL;
2500 scan_inst = (fs_inst *)scan_inst->prev) {
2501
2502 /* If we hit control flow, assume that there *are* outstanding
2503 * dependencies, and force their cleanup before our instruction.
2504 */
2505 if (scan_inst->is_control_flow()) {
2506 for (int i = 0; i < write_len; i++) {
2507 if (needs_dep[i]) {
2508 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2509 }
2510 }
2511 return;
2512 }
2513
2514 bool scan_inst_16wide = (dispatch_width > 8 &&
2515 !scan_inst->force_uncompressed &&
2516 !scan_inst->force_sechalf);
2517
2518 /* We insert our reads as late as possible on the assumption that any
2519 * instruction but a MOV that might have left us an outstanding
2520 * dependency has more latency than a MOV.
2521 */
2522 if (scan_inst->dst.file == GRF) {
2523 for (int i = 0; i < scan_inst->regs_written; i++) {
2524 int reg = scan_inst->dst.reg + i * reg_size;
2525
2526 if (reg >= first_write_grf &&
2527 reg < first_write_grf + write_len &&
2528 needs_dep[reg - first_write_grf]) {
2529 inst->insert_before(DEP_RESOLVE_MOV(reg));
2530 needs_dep[reg - first_write_grf] = false;
2531 if (scan_inst_16wide)
2532 needs_dep[reg - first_write_grf + 1] = false;
2533 }
2534 }
2535 }
2536
2537 /* Clear the flag for registers that actually got read (as expected). */
2538 clear_deps_for_inst_src(scan_inst, dispatch_width,
2539 needs_dep, first_write_grf, write_len);
2540
2541 /* Continue the loop only if we haven't resolved all the dependencies */
2542 int i;
2543 for (i = 0; i < write_len; i++) {
2544 if (needs_dep[i])
2545 break;
2546 }
2547 if (i == write_len)
2548 return;
2549 }
2550 }
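
/* Illustrative result (hypothetical register numbers): if an earlier
 * write to g14 was never read and the SEND is about to write g14, the
 * pass inserts a dependency-resolving self-move before the SEND:
 *
 *    mov g14, ...
 *    mov g14, g14         <- DEP_RESOLVE_MOV inserted here
 *    send g14.xy <...>
 *
 * so the scoreboard waits on the pending write.
 */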
2551
2552 /**
2553 * Implements this workaround for the original 965:
2554 *
2555 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2556 * used as a destination register until after it has been sourced by an
2557  *    instruction with a different destination register."
2558 */
2559 void
2560 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2561 {
2562 int write_len = inst->regs_written * dispatch_width / 8;
2563 int first_write_grf = inst->dst.reg;
2564 bool needs_dep[BRW_MAX_MRF];
2565 assert(write_len < (int)sizeof(needs_dep) - 1);
2566
2567 memset(needs_dep, false, sizeof(needs_dep));
2568 memset(needs_dep, true, write_len);
2569 /* Walk forwards looking for writes to registers we're writing which aren't
2570 * read before being written.
2571 */
2572 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2573 !scan_inst->is_tail_sentinel();
2574 scan_inst = (fs_inst *)scan_inst->next) {
2575 /* If we hit control flow, force resolve all remaining dependencies. */
2576 if (scan_inst->is_control_flow()) {
2577 for (int i = 0; i < write_len; i++) {
2578 if (needs_dep[i])
2579 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2580 }
2581 return;
2582 }
2583
2584 /* Clear the flag for registers that actually got read (as expected). */
2585 clear_deps_for_inst_src(scan_inst, dispatch_width,
2586 needs_dep, first_write_grf, write_len);
2587
2588 /* We insert our reads as late as possible since they're reading the
2589 * result of a SEND, which has massive latency.
2590 */
2591 if (scan_inst->dst.file == GRF &&
2592 scan_inst->dst.reg >= first_write_grf &&
2593 scan_inst->dst.reg < first_write_grf + write_len &&
2594 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2595 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2596 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2597 }
2598
2599 /* Continue the loop only if we haven't resolved all the dependencies */
2600 int i;
2601 for (i = 0; i < write_len; i++) {
2602 if (needs_dep[i])
2603 break;
2604 }
2605 if (i == write_len)
2606 return;
2607 }
2608
2609 /* If we hit the end of the program, resolve all remaining dependencies out
2610 * of paranoia.
2611 */
2612 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2613 assert(last_inst->eot);
2614 for (int i = 0; i < write_len; i++) {
2615 if (needs_dep[i])
2616 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2617 }
2618 }
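
/* Illustrative (hypothetical registers): per the errata above, after
 * "send g12 <...>" a later instruction may not write g12 until g12 has
 * been sourced, so the pass inserts a read first:
 *
 *    send g12 <...>
 *    mov g12, g12         <- DEP_RESOLVE_MOV inserted here
 *    mov g12, g13
 */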
2619
2620 void
2621 fs_visitor::insert_gen4_send_dependency_workarounds()
2622 {
2623 if (brw->gen != 4 || brw->is_g4x)
2624 return;
2625
2626 /* Note that we're done with register allocation, so GRF fs_regs always
2627 * have a .reg_offset of 0.
2628 */
2629
2630 foreach_list_safe(node, &this->instructions) {
2631 fs_inst *inst = (fs_inst *)node;
2632
2633 if (inst->mlen != 0 && inst->dst.file == GRF) {
2634 insert_gen4_pre_send_dependency_workarounds(inst);
2635 insert_gen4_post_send_dependency_workarounds(inst);
2636 }
2637 }
2638 }
2639
2640 /**
2641 * Turns the generic expression-style uniform pull constant load instruction
2642 * into a hardware-specific series of instructions for loading a pull
2643 * constant.
2644 *
2645 * The expression style allows the CSE pass before this to optimize out
2646 * repeated loads from the same offset, and gives the pre-register-allocation
2647 * scheduling full flexibility, while the conversion to native instructions
2648 * allows the post-register-allocation scheduler the best information
2649 * possible.
2650 *
2651 * Note that execution masking for setting up pull constant loads is special:
2652 * the channels that need to be written are unrelated to the current execution
2653 * mask, since a later instruction will use one of the result channels as a
2654 * source operand for all 8 or 16 of its channels.
2655 */
2656 void
2657 fs_visitor::lower_uniform_pull_constant_loads()
2658 {
2659 foreach_list(node, &this->instructions) {
2660 fs_inst *inst = (fs_inst *)node;
2661
2662 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2663 continue;
2664
2665 if (brw->gen >= 7) {
2666 /* The offset arg before was a vec4-aligned byte offset. We need to
2667 * turn it into a dword offset.
2668 */
2669 fs_reg const_offset_reg = inst->src[1];
2670 assert(const_offset_reg.file == IMM &&
2671 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2672 const_offset_reg.imm.u /= 4;
2673 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2674
2675 /* This is actually going to be a MOV, but since only the first dword
2676 * is accessed, we have a special opcode to do just that one. Note
2677 * that this needs to be an operation that will be considered a def
2678 * by live variable analysis, or register allocation will explode.
2679 */
2680 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2681 payload, const_offset_reg);
2682 setup->force_writemask_all = true;
2683
2684 setup->ir = inst->ir;
2685 setup->annotation = inst->annotation;
2686 inst->insert_before(setup);
2687
2688 /* Similarly, this will only populate the first 4 channels of the
2689 * result register (since we only use smear values from 0-3), but we
2690 * don't tell the optimizer.
2691 */
2692 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2693 inst->src[1] = payload;
2694
2695 this->live_intervals_valid = false;
2696 } else {
2697 /* Before register allocation, we didn't tell the scheduler about the
2698 * MRF we use. We know it's safe to use this MRF because nothing
2699 * else does except for register spill/unspill, which generates and
2700 * uses its MRF within a single IR instruction.
2701 */
2702 inst->base_mrf = 14;
2703 inst->mlen = 1;
2704 }
2705 }
2706 }
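
/* Schematic before/after for the gen7 path above (hypothetical operands):
 *
 *    uniform_pull_const_load vgrf4, surf, 32u
 *
 * becomes
 *
 *    set_simd4x2_offset           vgrf5, 8u   // 32 bytes -> 8 dwords
 *    uniform_pull_const_load_gen7 vgrf4, surf, vgrf5
 */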
2707
2708 void
2709 fs_visitor::dump_instruction(backend_instruction *be_inst)
2710 {
2711 fs_inst *inst = (fs_inst *)be_inst;
2712
2713 if (inst->predicate) {
2714 printf("(%cf0.%d) ",
2715 inst->predicate_inverse ? '-' : '+',
2716 inst->flag_subreg);
2717 }
2718
2719 printf("%s", brw_instruction_name(inst->opcode));
2720 if (inst->saturate)
2721 printf(".sat");
2722 if (inst->conditional_mod) {
2723 printf(".cmod");
2724 if (!inst->predicate &&
2725 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2726 inst->opcode != BRW_OPCODE_IF &&
2727 inst->opcode != BRW_OPCODE_WHILE))) {
2728 printf(".f0.%d", inst->flag_subreg);
2729 }
2730 }
2731 printf(" ");
2732
2733
2734 switch (inst->dst.file) {
2735 case GRF:
2736 printf("vgrf%d", inst->dst.reg);
2737 if (inst->dst.reg_offset)
2738 printf("+%d", inst->dst.reg_offset);
2739 break;
2740 case MRF:
2741 printf("m%d", inst->dst.reg);
2742 break;
2743 case BAD_FILE:
2744 printf("(null)");
2745 break;
2746 case UNIFORM:
2747 printf("***u%d***", inst->dst.reg);
2748 break;
2749 case ARF:
2750 if (inst->dst.reg == BRW_ARF_NULL)
2751 printf("(null)");
2752 else
2753 printf("arf%d", inst->dst.reg);
2754 break;
2755 default:
2756 printf("???");
2757 break;
2758 }
2759 printf(", ");
2760
2761 for (int i = 0; i < 3; i++) {
2762 if (inst->src[i].negate)
2763 printf("-");
2764 if (inst->src[i].abs)
2765 printf("|");
2766 switch (inst->src[i].file) {
2767 case GRF:
2768 printf("vgrf%d", inst->src[i].reg);
2769 if (inst->src[i].reg_offset)
2770 printf("+%d", inst->src[i].reg_offset);
2771 break;
2772 case MRF:
2773 printf("***m%d***", inst->src[i].reg);
2774 break;
2775 case UNIFORM:
2776 printf("u%d", inst->src[i].reg);
2777 if (inst->src[i].reg_offset)
2778 printf(".%d", inst->src[i].reg_offset);
2779 break;
2780 case BAD_FILE:
2781 printf("(null)");
2782 break;
2783 case IMM:
2784 switch (inst->src[i].type) {
2785 case BRW_REGISTER_TYPE_F:
2786 printf("%ff", inst->src[i].imm.f);
2787 break;
2788 case BRW_REGISTER_TYPE_D:
2789 printf("%dd", inst->src[i].imm.i);
2790 break;
2791 case BRW_REGISTER_TYPE_UD:
2792 printf("%uu", inst->src[i].imm.u);
2793 break;
2794 default:
2795 printf("???");
2796 break;
2797 }
2798 break;
2799 default:
2800 printf("???");
2801 break;
2802 }
2803 if (inst->src[i].abs)
2804 printf("|");
2805
2806       if (i < 2)
2807 printf(", ");
2808 }
2809
2810 printf(" ");
2811
2812 if (inst->force_uncompressed)
2813 printf("1sthalf ");
2814
2815 if (inst->force_sechalf)
2816 printf("2ndhalf ");
2817
2818 printf("\n");
2819 }
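
/* Roughly what the dump above produces for a hypothetical instruction
 * (predicate, opcode with modifiers, destination, then up to three
 * sources):
 *
 *    (+f0.1) add.sat vgrf3+1, vgrf7, -|vgrf9|, (null) 2ndhalf
 */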
2820
2821 /**
2822 * Possibly returns an instruction that set up @param reg.
2823 *
2824 * Sometimes we want to take the result of some expression/variable
2825 * dereference tree and rewrite the instruction generating the result
2826 * of the tree. When processing the tree, we know that the
2827 * instructions generated are all writing temporaries that are dead
2828 * outside of this tree. So, if we have some instructions that write
2829 * a temporary, we're free to point that temp write somewhere else.
2830 *
2831  * Note that this doesn't guarantee that the returned instruction wrote
2832  * only reg -- it might be the size=4 destination of a texture instruction.
2833 */
2834 fs_inst *
2835 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2836 fs_inst *end,
2837 fs_reg reg)
2838 {
2839 if (end == start ||
2840 end->is_partial_write() ||
2841 reg.reladdr ||
2842 !reg.equals(end->dst)) {
2843 return NULL;
2844 } else {
2845 return end;
2846 }
2847 }
2848
2849 void
2850 fs_visitor::setup_payload_gen6()
2851 {
2852 bool uses_depth =
2853 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2854 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2855
2856 assert(brw->gen >= 6);
2857
2858 /* R0-1: masks, pixel X/Y coordinates. */
2859 c->nr_payload_regs = 2;
2860    /* R2: only for 32-pixel dispatch. */
2861
2862 /* R3-26: barycentric interpolation coordinates. These appear in the
2863 * same order that they appear in the brw_wm_barycentric_interp_mode
2864 * enum. Each set of coordinates occupies 2 registers if dispatch width
2865 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2866 * appear if they were enabled using the "Barycentric Interpolation
2867 * Mode" bits in WM_STATE.
2868 */
2869 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2870 if (barycentric_interp_modes & (1 << i)) {
2871 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2872 c->nr_payload_regs += 2;
2873 if (dispatch_width == 16) {
2874 c->nr_payload_regs += 2;
2875 }
2876 }
2877 }
2878
2879 /* R27: interpolated depth if uses source depth */
2880 if (uses_depth) {
2881 c->source_depth_reg = c->nr_payload_regs;
2882 c->nr_payload_regs++;
2883 if (dispatch_width == 16) {
2884 /* R28: interpolated depth if not 8-wide. */
2885 c->nr_payload_regs++;
2886 }
2887 }
2888 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2889 if (uses_depth) {
2890 c->source_w_reg = c->nr_payload_regs;
2891 c->nr_payload_regs++;
2892 if (dispatch_width == 16) {
2893 /* R30: interpolated W if not 8-wide. */
2894 c->nr_payload_regs++;
2895 }
2896 }
2897 /* R31: MSAA position offsets. */
2898 /* R32-: bary for 32-pixel. */
2899 /* R58-59: interp W for 32-pixel. */
2900
2901 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2902 c->source_depth_to_render_target = true;
2903 }
2904 }
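
/* Worked example (hypothetical state): a 16-wide shader with two
 * barycentric modes enabled plus source depth and W lays out as
 *
 *    r0-r1    masks, pixel X/Y        nr_payload_regs = 2
 *    r2-r5    first barycentric set   += 4 (16-wide)
 *    r6-r9    second barycentric set  += 4
 *    r10-r11  source depth            += 2
 *    r12-r13  source W                += 2  -> 14 payload registers
 */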
2905
2906 bool
2907 fs_visitor::run()
2908 {
2909 sanity_param_count = fp->Base.Parameters->NumParameters;
2910 uint32_t orig_nr_params = c->prog_data.nr_params;
2911
2912 if (brw->gen >= 6)
2913 setup_payload_gen6();
2914 else
2915 setup_payload_gen4();
2916
2917 if (0) {
2918 emit_dummy_fs();
2919 } else {
2920 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2921 emit_shader_time_begin();
2922
2923 calculate_urb_setup();
2924 if (brw->gen < 6)
2925 emit_interpolation_setup_gen4();
2926 else
2927 emit_interpolation_setup_gen6();
2928
2929 /* We handle discards by keeping track of the still-live pixels in f0.1.
2930 * Initialize it with the dispatched pixels.
2931 */
2932 if (fp->UsesKill) {
2933 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2934 discard_init->flag_subreg = 1;
2935 }
2936
2937 /* Generate FS IR for main(). (the visitor only descends into
2938 * functions called "main").
2939 */
2940 if (shader) {
2941 foreach_list(node, &*shader->ir) {
2942 ir_instruction *ir = (ir_instruction *)node;
2943 base_ir = ir;
2944 this->result = reg_undef;
2945 ir->accept(this);
2946 }
2947 } else {
2948 emit_fragment_program_code();
2949 }
2950 base_ir = NULL;
2951 if (failed)
2952 return false;
2953
2954 emit(FS_OPCODE_PLACEHOLDER_HALT);
2955
2956 emit_fb_writes();
2957
2958 split_virtual_grfs();
2959
2960 move_uniform_array_access_to_pull_constants();
2961 setup_pull_constants();
2962
2963 bool progress;
2964 do {
2965 progress = false;
2966
2967 compact_virtual_grfs();
2968
2969 progress = remove_duplicate_mrf_writes() || progress;
2970
2971 progress = opt_algebraic() || progress;
2972 progress = opt_cse() || progress;
2973 progress = opt_copy_propagate() || progress;
2974 progress = dead_code_eliminate() || progress;
2975 progress = dead_code_eliminate_local() || progress;
2976 progress = register_coalesce() || progress;
2977 progress = register_coalesce_2() || progress;
2978 progress = compute_to_mrf() || progress;
2979 } while (progress);
2980
2981 remove_dead_constants();
2982
2983 schedule_instructions(false);
2984
2985 lower_uniform_pull_constant_loads();
2986
2987 assign_curb_setup();
2988 assign_urb_setup();
2989
2990 if (0) {
2991 /* Debug of register spilling: Go spill everything. */
2992 for (int i = 0; i < virtual_grf_count; i++) {
2993 spill_reg(i);
2994 }
2995 }
2996
2997 if (0)
2998 assign_regs_trivial();
2999 else {
3000 while (!assign_regs()) {
3001 if (failed)
3002 break;
3003 }
3004 }
3005 }
3006 assert(force_uncompressed_stack == 0);
3007 assert(force_sechalf_stack == 0);
3008
3009 /* This must come after all optimization and register allocation, since
3010 * it inserts dead code that happens to have side effects, and it does
3011 * so based on the actual physical registers in use.
3012 */
3013 insert_gen4_send_dependency_workarounds();
3014
3015 if (failed)
3016 return false;
3017
3018 schedule_instructions(true);
3019
3020 if (dispatch_width == 8) {
3021 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3022 } else {
3023 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3024
3025 /* Make sure we didn't try to sneak in an extra uniform */
3026 assert(orig_nr_params == c->prog_data.nr_params);
3027 (void) orig_nr_params;
3028 }
3029
3030 /* If any state parameters were appended, then ParameterValues could have
3031 * been realloced, in which case the driver uniform storage set up by
3032 * _mesa_associate_uniform_storage() would point to freed memory. Make
3033 * sure that didn't happen.
3034 */
3035 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3036
3037 return !failed;
3038 }
3039
3040 const unsigned *
3041 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3042 struct gl_fragment_program *fp,
3043 struct gl_shader_program *prog,
3044 unsigned *final_assembly_size)
3045 {
3046 bool start_busy = false;
3047 float start_time = 0;
3048
3049 if (unlikely(brw->perf_debug)) {
3050 start_busy = (brw->batch.last_bo &&
3051 drm_intel_bo_busy(brw->batch.last_bo));
3052 start_time = get_time();
3053 }
3054
3055 struct brw_shader *shader = NULL;
3056 if (prog)
3057 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3058
3059 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3060 if (prog) {
3061 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3062 _mesa_print_ir(shader->ir, NULL);
3063 printf("\n\n");
3064 } else {
3065 printf("ARB_fragment_program %d ir for native fragment shader\n",
3066 fp->Base.Id);
3067 _mesa_print_program(&fp->Base);
3068 }
3069 }
3070
3071 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3072 */
3073 fs_visitor v(brw, c, prog, fp, 8);
3074 if (!v.run()) {
3075 if (prog) {
3076 prog->LinkStatus = false;
3077 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3078 }
3079
3080 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3081 v.fail_msg);
3082
3083 return NULL;
3084 }
3085
3086 exec_list *simd16_instructions = NULL;
3087 fs_visitor v2(brw, c, prog, fp, 16);
3088 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3089 if (c->prog_data.nr_pull_params == 0) {
3090 /* Try a 16-wide compile */
3091 v2.import_uniforms(&v);
3092 if (!v2.run()) {
3093 perf_debug("16-wide shader failed to compile, falling back to "
3094 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3095 } else {
3096 simd16_instructions = &v2.instructions;
3097 }
3098 } else {
3099 perf_debug("Skipping 16-wide due to pull parameters.\n");
3100 }
3101 }
3102
3103 c->prog_data.dispatch_width = 8;
3104
3105 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3106 const unsigned *generated = g.generate_assembly(&v.instructions,
3107 simd16_instructions,
3108 final_assembly_size);
3109
3110 if (unlikely(brw->perf_debug) && shader) {
3111 if (shader->compiled_once)
3112 brw_wm_debug_recompile(brw, prog, &c->key);
3113 shader->compiled_once = true;
3114
3115 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3116 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3117 (get_time() - start_time) * 1000);
3118 }
3119 }
3120
3121 return generated;
3122 }
3123
3124 bool
3125 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3126 {
3127 struct brw_context *brw = brw_context(ctx);
3128 struct brw_wm_prog_key key;
3129
3130 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3131 return true;
3132
3133 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3134 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3135 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3136 bool program_uses_dfdy = fp->UsesDFdy;
3137
3138 memset(&key, 0, sizeof(key));
3139
3140 if (brw->gen < 6) {
3141 if (fp->UsesKill)
3142 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3143
3144 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3145 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3146
3147 /* Just assume depth testing. */
3148 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3149 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3150 }
3151
3152 if (brw->gen < 6)
3153 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3154
3155 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3156
3157 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3158 for (unsigned i = 0; i < sampler_count; i++) {
3159 if (fp->Base.ShadowSamplers & (1 << i)) {
3160 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3161 key.tex.swizzles[i] =
3162 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3163 } else {
3164 /* Color sampler: assume no swizzling. */
3165 key.tex.swizzles[i] = SWIZZLE_XYZW;
3166 }
3167 }
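
   /* e.g. (illustrative) a shadow sampler in unit 0 gets the (x,x,x,1)
    * swizzle above, while a color sampler keeps the identity XYZW
    * swizzle. */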
3168
3169 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3170 key.drawable_height = ctx->DrawBuffer->Height;
3171 }
3172
3173 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3174 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3175 }
3176
3177 key.nr_color_regions = 1;
3178
3179 key.program_string_id = bfp->id;
3180
3181 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3182 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3183
3184 bool success = do_wm_prog(brw, prog, bfp, &key);
3185
3186 brw->wm.base.prog_offset = old_prog_offset;
3187 brw->wm.prog_data = old_prog_data;
3188
3189 return success;
3190 }