src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/hash_table.h"
  36 #include "main/macros.h"
  37 #include "main/shaderobj.h"
  38 #include "main/uniforms.h"
  39 #include "main/fbobject.h"
  40 #include "program/prog_parameter.h"
  41 #include "program/prog_print.h"
  42 #include "program/register_allocate.h"
  43 #include "program/sampler.h"
  44 #include "program/hash_table.h"
  45 #include "brw_context.h"
  46 #include "brw_eu.h"
  47 #include "brw_wm.h"
  48 }
  49 #include "brw_fs.h"
  50 #include "glsl/glsl_types.h"
  51 #include "glsl/ir_print_visitor.h"
  52
  53 void
  54 fs_inst::init()
  55 {
  56    memset(this, 0, sizeof(*this));
  57    this->opcode = BRW_OPCODE_NOP;
  58    this->conditional_mod = BRW_CONDITIONAL_NONE;
  59
  60    this->dst = reg_undef;
  61    this->src[0] = reg_undef;
  62    this->src[1] = reg_undef;
  63    this->src[2] = reg_undef;
  64
  65    /* This will be the case for almost all instructions. */
  66    this->regs_written = 1;
  67 }
  68
  69 fs_inst::fs_inst()
  70 {
  71    init();
  72 }
  73
  74 fs_inst::fs_inst(enum opcode opcode)
  75 {
  76    init();
  77    this->opcode = opcode;
  78 }
  79
  80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  81 {
  82    init();
  83    this->opcode = opcode;
  84    this->dst = dst;
  85
  86    if (dst.file == GRF)
  87       assert(dst.reg_offset >= 0);
  88 }
  89
  90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  91 {
  92    init();
  93    this->opcode = opcode;
  94    this->dst = dst;
  95    this->src[0] = src0;
  96
  97    if (dst.file == GRF)
  98       assert(dst.reg_offset >= 0);
  99    if (src[0].file == GRF)
 100       assert(src[0].reg_offset >= 0);
 101 }
 102
 103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 104 {
 105    init();
 106    this->opcode = opcode;
 107    this->dst = dst;
 108    this->src[0] = src0;
 109    this->src[1] = src1;
 110
 111    if (dst.file == GRF)
 112       assert(dst.reg_offset >= 0);
 113    if (src[0].file == GRF)
 114       assert(src[0].reg_offset >= 0);
 115    if (src[1].file == GRF)
 116       assert(src[1].reg_offset >= 0);
 117 }
 118
 119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 120                  fs_reg src0, fs_reg src1, fs_reg src2)
 121 {
 122    init();
 123    this->opcode = opcode;
 124    this->dst = dst;
 125    this->src[0] = src0;
 126    this->src[1] = src1;
 127    this->src[2] = src2;
 128
 129    if (dst.file == GRF)
 130       assert(dst.reg_offset >= 0);
 131    if (src[0].file == GRF)
 132       assert(src[0].reg_offset >= 0);
 133    if (src[1].file == GRF)
 134       assert(src[1].reg_offset >= 0);
 135    if (src[2].file == GRF)
 136       assert(src[2].reg_offset >= 0);
 137 }
 138
/* ALU1(op) defines fs_visitor::op(dst, src0), which allocates a
 * BRW_OPCODE_<op> instruction with one source out of mem_ctx and returns
 * it.  The helper only constructs the instruction; it does not call emit().
 */
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

/* ALU2(op): same as ALU1, for instructions taking two sources. */
#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

/* ALU3(op): same as ALU1, for instructions taking three sources. */
#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }
 159
/* Instantiate the fs_visitor instruction-builder helpers.  The macro used
 * for each opcode fixes how many source operands the helper takes.
 */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
 183
 184 /** Gen4 predicated IF. */
 185 fs_inst *
 186 fs_visitor::IF(uint32_t predicate)
 187 {
 188    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 189    inst->predicate = predicate;
 190    return inst;
 191 }
 192
 193 /** Gen6+ IF with embedded comparison. */
 194 fs_inst *
 195 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 196 {
 197    assert(intel->gen >= 6);
 198    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 199                                         reg_null_d, src0, src1);
 200    inst->conditional_mod = condition;
 201    return inst;
 202 }
 203
 204 /**
 205  * CMP: Sets the low bit of the destination channels with the result
 206  * of the comparison, while the upper bits are undefined, and updates
 207  * the flag register with the packed 16 bits of the result.
 208  */
 209 fs_inst *
 210 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 211 {
 212    fs_inst *inst;
 213
 214    /* Take the instruction:
 215     *
 216     * CMP null<d> src0<f> src1<f>
 217     *
 218     * Original gen4 does type conversion to the destination type before
 219     * comparison, producing garbage results for floating point comparisons.
 220     * gen5 does the comparison on the execution type (resolved source types),
 221     * so dst type doesn't matter.  gen6 does comparison and then uses the
 222     * result as if it was the dst type with no conversion, which happens to
 223     * mostly work out for float-interpreted-as-int since our comparisons are
 224     * for >0, =0, <0.
 225     */
 226    if (intel->gen == 4) {
 227       dst.type = src0.type;
 228       if (dst.file == HW_REG)
 229          dst.fixed_hw_reg.type = dst.type;
 230    }
 231
 232    resolve_ud_negate(&src0);
 233    resolve_ud_negate(&src1);
 234
 235    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 236    inst->conditional_mod = condition;
 237
 238    return inst;
 239 }
 240
 241 exec_list
 242 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
 243                                        fs_reg varying_offset,
 244                                        uint32_t const_offset)
 245 {
 246    exec_list instructions;
 247    fs_inst *inst;
 248
 249    /* We have our constant surface use a pitch of 4 bytes, so our index can
 250     * be any component of a vector, and then we load 4 contiguous
 251     * components starting from that.
 252     *
 253     * We break down the const_offset to a portion added to the variable
 254     * offset and a portion done using reg_offset, which means that if you
 255     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 256     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 257     * CSE can later notice that those loads are all the same and eliminate
 258     * the redundant ones.
 259     */
 260    fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
 261    instructions.push_tail(ADD(vec4_offset,
 262                               varying_offset, const_offset & ~3));
 263
 264    int scale = 1;
 265    if (intel->gen == 4 && dispatch_width == 8) {
 266       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 267        * u, v, r) as parameters, or we can just use the SIMD16 message
 268        * consisting of (header, u).  We choose the second, at the cost of a
 269        * longer return length.
 270        */
 271       scale = 2;
 272    }
 273
 274    enum opcode op;
 275    if (intel->gen >= 7)
 276       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 277    else
 278       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 279    fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
 280    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 281    inst->regs_written = 4 * scale;
 282    instructions.push_tail(inst);
 283
 284    if (intel->gen < 7) {
 285       inst->base_mrf = 13;
 286       inst->header_present = true;
 287       if (intel->gen == 4)
 288          inst->mlen = 3;
 289       else
 290          inst->mlen = 1 + dispatch_width / 8;
 291    }
 292
 293    vec4_result.reg_offset += (const_offset & 3) * scale;
 294    instructions.push_tail(MOV(dst, vec4_result));
 295
 296    return instructions;
 297 }
 298
 299 /**
 300  * A helper for MOV generation for fixing up broken hardware SEND dependency
 301  * handling.
 302  */
 303 fs_inst *
 304 fs_visitor::DEP_RESOLVE_MOV(int grf)
 305 {
 306    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 307
 308    inst->ir = NULL;
 309    inst->annotation = "send dependency resolve";
 310
 311    /* The caller always wants uncompressed to emit the minimal extra
 312     * dependencies, and to avoid having to deal with aligning its regs to 2.
 313     */
 314    inst->force_uncompressed = true;
 315
 316    return inst;
 317 }
 318
 319 bool
 320 fs_inst::equals(fs_inst *inst)
 321 {
 322    return (opcode == inst->opcode &&
 323            dst.equals(inst->dst) &&
 324            src[0].equals(inst->src[0]) &&
 325            src[1].equals(inst->src[1]) &&
 326            src[2].equals(inst->src[2]) &&
 327            saturate == inst->saturate &&
 328            predicate == inst->predicate &&
 329            conditional_mod == inst->conditional_mod &&
 330            mlen == inst->mlen &&
 331            base_mrf == inst->base_mrf &&
 332            sampler == inst->sampler &&
 333            target == inst->target &&
 334            eot == inst->eot &&
 335            header_present == inst->header_present &&
 336            shadow_compare == inst->shadow_compare &&
 337            offset == inst->offset);
 338 }
 339
 340 bool
 341 fs_inst::overwrites_reg(const fs_reg &reg)
 342 {
 343    return (reg.file == dst.file &&
 344            reg.reg == dst.reg &&
 345            reg.reg_offset >= dst.reg_offset  &&
 346            reg.reg_offset < dst.reg_offset + regs_written);
 347 }
 348
 349 bool
 350 fs_inst::is_send_from_grf()
 351 {
 352    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 353            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
 354            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 355             src[1].file == GRF));
 356 }
 357
 358 bool
 359 fs_visitor::can_do_source_mods(fs_inst *inst)
 360 {
 361    if (intel->gen == 6 && inst->is_math())
 362       return false;
 363
 364    if (inst->is_send_from_grf())
 365       return false;
 366
 367    return true;
 368 }
 369
 370 void
 371 fs_reg::init()
 372 {
 373    memset(this, 0, sizeof(*this));
 374    this->smear = -1;
 375 }
 376
 377 /** Generic unset register constructor. */
 378 fs_reg::fs_reg()
 379 {
 380    init();
 381    this->file = BAD_FILE;
 382 }
 383
 384 /** Immediate value constructor. */
 385 fs_reg::fs_reg(float f)
 386 {
 387    init();
 388    this->file = IMM;
 389    this->type = BRW_REGISTER_TYPE_F;
 390    this->imm.f = f;
 391 }
 392
 393 /** Immediate value constructor. */
 394 fs_reg::fs_reg(int32_t i)
 395 {
 396    init();
 397    this->file = IMM;
 398    this->type = BRW_REGISTER_TYPE_D;
 399    this->imm.i = i;
 400 }
 401
 402 /** Immediate value constructor. */
 403 fs_reg::fs_reg(uint32_t u)
 404 {
 405    init();
 406    this->file = IMM;
 407    this->type = BRW_REGISTER_TYPE_UD;
 408    this->imm.u = u;
 409 }
 410
 411 /** Fixed brw_reg Immediate value constructor. */
 412 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 413 {
 414    init();
 415    this->file = HW_REG;
 416    this->fixed_hw_reg = fixed_hw_reg;
 417    this->type = fixed_hw_reg.type;
 418 }
 419
 420 bool
 421 fs_reg::equals(const fs_reg &r) const
 422 {
 423    return (file == r.file &&
 424            reg == r.reg &&
 425            reg_offset == r.reg_offset &&
 426            type == r.type &&
 427            negate == r.negate &&
 428            abs == r.abs &&
 429            !reladdr && !r.reladdr &&
 430            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 431                   sizeof(fixed_hw_reg)) == 0 &&
 432            smear == r.smear &&
 433            imm.u == r.imm.u);
 434 }
 435
 436 bool
 437 fs_reg::is_zero() const
 438 {
 439    if (file != IMM)
 440       return false;
 441
 442    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 443 }
 444
 445 bool
 446 fs_reg::is_one() const
 447 {
 448    if (file != IMM)
 449       return false;
 450
 451    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 452 }
 453
 454 bool
 455 fs_reg::is_valid_3src() const
 456 {
 457    return file == GRF || file == UNIFORM;
 458 }
 459
 460 int
 461 fs_visitor::type_size(const struct glsl_type *type)
 462 {
 463    unsigned int size, i;
 464
 465    switch (type->base_type) {
 466    case GLSL_TYPE_UINT:
 467    case GLSL_TYPE_INT:
 468    case GLSL_TYPE_FLOAT:
 469    case GLSL_TYPE_BOOL:
 470       return type->components();
 471    case GLSL_TYPE_ARRAY:
 472       return type_size(type->fields.array) * type->length;
 473    case GLSL_TYPE_STRUCT:
 474       size = 0;
 475       for (i = 0; i < type->length; i++) {
 476          size += type_size(type->fields.structure[i].type);
 477       }
 478       return size;
 479    case GLSL_TYPE_SAMPLER:
 480       /* Samplers take up no register space, since they're baked in at
 481        * link time.
 482        */
 483       return 0;
 484    case GLSL_TYPE_VOID:
 485    case GLSL_TYPE_ERROR:
 486    case GLSL_TYPE_INTERFACE:
 487       assert(!"not reached");
 488       break;
 489    }
 490
 491    return 0;
 492 }
 493
 494 fs_reg
 495 fs_visitor::get_timestamp()
 496 {
 497    assert(intel->gen >= 7);
 498
 499    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 500                                           BRW_ARF_TIMESTAMP,
 501                                           0),
 502                              BRW_REGISTER_TYPE_UD));
 503
 504    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 505
 506    fs_inst *mov = emit(MOV(dst, ts));
 507    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 508     * even if it's not enabled in the dispatch.
 509     */
 510    mov->force_writemask_all = true;
 511    mov->force_uncompressed = true;
 512
 513    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 514     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 515     * which is plenty of time for our purposes.  It is identical across the
 516     * EUs, but since it's tracking GPU core speed it will increment at a
 517     * varying rate as render P-states change.
 518     *
 519     * The caller could also check if render P-states have changed (or anything
 520     * else that might disrupt timing) by setting smear to 2 and checking if
 521     * that field is != 0.
 522     */
 523    dst.smear = 0;
 524
 525    return dst;
 526 }
 527
 528 void
 529 fs_visitor::emit_shader_time_begin()
 530 {
 531    current_annotation = "shader time start";
 532    shader_start_time = get_timestamp();
 533 }
 534
/**
 * Emits the end-of-shader timing code: reads the timestamp again and,
 * depending on whether the timestamp's reset field is set, either
 * accumulates the elapsed time (plus a "written" count) or bumps the
 * "reset" counter for the current dispatch width.
 */
void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   /* Pick the shader-time slots that correspond to this dispatch width. */
   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   /* Test bit 0 of field 2; the IF below is predicated on the result of
    * this AND with the Z conditional modifier.
    */
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   /* diff = end - start, computed as (-start) + end. */
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}
 583
 584 void
 585 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 586                                    fs_reg value)
 587 {
 588    int shader_time_index =
 589       brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
 590    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 591
 592    fs_reg payload;
 593    if (dispatch_width == 8)
 594       payload = fs_reg(this, glsl_type::uvec2_type);
 595    else
 596       payload = fs_reg(this, glsl_type::uint_type);
 597
 598    emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 599                 fs_reg(), payload, offset, value));
 600 }
 601
 602 void
 603 fs_visitor::fail(const char *format, ...)
 604 {
 605    va_list va;
 606    char *msg;
 607
 608    if (failed)
 609       return;
 610
 611    failed = true;
 612
 613    va_start(va, format);
 614    msg = ralloc_vasprintf(mem_ctx, format, va);
 615    va_end(va);
 616    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 617
 618    this->fail_msg = msg;
 619
 620    if (INTEL_DEBUG & DEBUG_WM) {
 621       fprintf(stderr, "%s",  msg);
 622    }
 623 }
 624
 625 fs_inst *
 626 fs_visitor::emit(enum opcode opcode)
 627 {
 628    return emit(fs_inst(opcode));
 629 }
 630
 631 fs_inst *
 632 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 633 {
 634    return emit(fs_inst(opcode, dst));
 635 }
 636
 637 fs_inst *
 638 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 639 {
 640    return emit(fs_inst(opcode, dst, src0));
 641 }
 642
 643 fs_inst *
 644 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 645 {
 646    return emit(fs_inst(opcode, dst, src0, src1));
 647 }
 648
 649 fs_inst *
 650 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 651                  fs_reg src0, fs_reg src1, fs_reg src2)
 652 {
 653    return emit(fs_inst(opcode, dst, src0, src1, src2));
 654 }
 655
 656 void
 657 fs_visitor::push_force_uncompressed()
 658 {
 659    force_uncompressed_stack++;
 660 }
 661
 662 void
 663 fs_visitor::pop_force_uncompressed()
 664 {
 665    force_uncompressed_stack--;
 666    assert(force_uncompressed_stack >= 0);
 667 }
 668
 669 void
 670 fs_visitor::push_force_sechalf()
 671 {
 672    force_sechalf_stack++;
 673 }
 674
 675 void
 676 fs_visitor::pop_force_sechalf()
 677 {
 678    force_sechalf_stack--;
 679    assert(force_sechalf_stack >= 0);
 680 }
 681
 682 /**
 683  * Returns true if the instruction has a flag that means it won't
 684  * update an entire destination register.
 685  *
 686  * For example, dead code elimination and live variable analysis want to know
 687  * when a write to a variable screens off any preceding values that were in
 688  * it.
 689  */
 690 bool
 691 fs_inst::is_partial_write()
 692 {
 693    return (this->predicate ||
 694            this->force_uncompressed ||
 695            this->force_sechalf);
 696 }
 697
 698 /**
 699  * Returns how many MRFs an FS opcode will write over.
 700  *
 701  * Note that this is not the 0 or 1 implied writes in an actual gen
 702  * instruction -- the FS opcodes often generate MOVs in addition.
 703  */
 704 int
 705 fs_visitor::implied_mrf_writes(fs_inst *inst)
 706 {
 707    if (inst->mlen == 0)
 708       return 0;
 709
 710    switch (inst->opcode) {
 711    case SHADER_OPCODE_RCP:
 712    case SHADER_OPCODE_RSQ:
 713    case SHADER_OPCODE_SQRT:
 714    case SHADER_OPCODE_EXP2:
 715    case SHADER_OPCODE_LOG2:
 716    case SHADER_OPCODE_SIN:
 717    case SHADER_OPCODE_COS:
 718       return 1 * dispatch_width / 8;
 719    case SHADER_OPCODE_POW:
 720    case SHADER_OPCODE_INT_QUOTIENT:
 721    case SHADER_OPCODE_INT_REMAINDER:
 722       return 2 * dispatch_width / 8;
 723    case SHADER_OPCODE_TEX:
 724    case FS_OPCODE_TXB:
 725    case SHADER_OPCODE_TXD:
 726    case SHADER_OPCODE_TXF:
 727    case SHADER_OPCODE_TXF_MS:
 728    case SHADER_OPCODE_TXL:
 729    case SHADER_OPCODE_TXS:
 730    case SHADER_OPCODE_LOD:
 731       return 1;
 732    case FS_OPCODE_FB_WRITE:
 733       return 2;
 734    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 735    case FS_OPCODE_UNSPILL:
 736       return 1;
 737    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 738       return inst->mlen;
 739    case FS_OPCODE_SPILL:
 740       return 2;
 741    default:
 742       assert(!"not reached");
 743       return inst->mlen;
 744    }
 745 }
 746
 747 int
 748 fs_visitor::virtual_grf_alloc(int size)
 749 {
 750    if (virtual_grf_array_size <= virtual_grf_count) {
 751       if (virtual_grf_array_size == 0)
 752          virtual_grf_array_size = 16;
 753       else
 754          virtual_grf_array_size *= 2;
 755       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 756                                    virtual_grf_array_size);
 757    }
 758    virtual_grf_sizes[virtual_grf_count] = size;
 759    return virtual_grf_count++;
 760 }
 761
 762 /** Fixed HW reg constructor. */
 763 fs_reg::fs_reg(enum register_file file, int reg)
 764 {
 765    init();
 766    this->file = file;
 767    this->reg = reg;
 768    this->type = BRW_REGISTER_TYPE_F;
 769 }
 770
 771 /** Fixed HW reg constructor. */
 772 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 773 {
 774    init();
 775    this->file = file;
 776    this->reg = reg;
 777    this->type = type;
 778 }
 779
 780 /** Automatic reg constructor. */
 781 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 782 {
 783    init();
 784
 785    this->file = GRF;
 786    this->reg = v->virtual_grf_alloc(v->type_size(type));
 787    this->reg_offset = 0;
 788    this->type = brw_type_for_base_type(type);
 789 }
 790
 791 fs_reg *
 792 fs_visitor::variable_storage(ir_variable *var)
 793 {
 794    return (fs_reg *)hash_table_find(this->variable_ht, var);
 795 }
 796
 797 void
 798 import_uniforms_callback(const void *key,
 799                          void *data,
 800                          void *closure)
 801 {
 802    struct hash_table *dst_ht = (struct hash_table *)closure;
 803    const fs_reg *reg = (const fs_reg *)data;
 804
 805    if (reg->file != UNIFORM)
 806       return;
 807
 808    hash_table_insert(dst_ht, data, key);
 809 }
 810
 811 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 812  * This brings in those uniform definitions
 813  */
 814 void
 815 fs_visitor::import_uniforms(fs_visitor *v)
 816 {
 817    hash_table_call_foreach(v->variable_ht,
 818                            import_uniforms_callback,
 819                            variable_ht);
 820    this->params_remap = v->params_remap;
 821 }
 822
 823 /* Our support for uniforms is piggy-backed on the struct
 824  * gl_fragment_program, because that's where the values actually
 825  * get stored, rather than in some global gl_shader_program uniform
 826  * store.
 827  */
 828 void
 829 fs_visitor::setup_uniform_values(ir_variable *ir)
 830 {
 831    int namelen = strlen(ir->name);
 832
 833    /* The data for our (non-builtin) uniforms is stored in a series of
 834     * gl_uniform_driver_storage structs for each subcomponent that
 835     * glGetUniformLocation() could name.  We know it's been set up in the same
 836     * order we'd walk the type, so walk the list of storage and find anything
 837     * with our name, or the prefix of a component that starts with our name.
 838     */
 839    unsigned params_before = c->prog_data.nr_params;
 840    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 841       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 842
 843       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 844           (storage->name[namelen] != 0 &&
 845            storage->name[namelen] != '.' &&
 846            storage->name[namelen] != '[')) {
 847          continue;
 848       }
 849
 850       unsigned slots = storage->type->component_slots();
 851       if (storage->array_elements)
 852          slots *= storage->array_elements;
 853
 854       for (unsigned i = 0; i < slots; i++) {
 855          c->prog_data.param[c->prog_data.nr_params++] =
 856             &storage->storage[i].f;
 857       }
 858    }
 859
 860    /* Make sure we actually initialized the right amount of stuff here. */
 861    assert(params_before + ir->type->component_slots() ==
 862           c->prog_data.nr_params);
 863    (void)params_before;
 864 }
 865
 866
 867 /* Our support for builtin uniforms is even scarier than non-builtin.
 868  * It sits on top of the PROG_STATE_VAR parameters that are
 869  * automatically updated from GL context state.
 870  */
 871 void
 872 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 873 {
 874    const ir_state_slot *const slots = ir->state_slots;
 875    assert(ir->state_slots != NULL);
 876
 877    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 878       /* This state reference has already been setup by ir_to_mesa, but we'll
 879        * get the same index back here.
 880        */
 881       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 882                                             (gl_state_index *)slots[i].tokens);
 883
 884       /* Add each of the unique swizzles of the element as a parameter.
 885        * This'll end up matching the expected layout of the
 886        * array/matrix/structure we're trying to fill in.
 887        */
 888       int last_swiz = -1;
 889       for (unsigned int j = 0; j < 4; j++) {
 890          int swiz = GET_SWZ(slots[i].swizzle, j);
 891          if (swiz == last_swiz)
 892             break;
 893          last_swiz = swiz;
 894
 895          c->prog_data.param[c->prog_data.nr_params++] =
 896             &fp->Base.Parameters->ParameterValues[index][swiz].f;
 897       }
 898    }
 899 }
 900
/**
 * Emits the instructions that compute gl_FragCoord into a newly allocated
 * register and returns that register.
 *
 * The four components are written in order (x, y, z, w) by stepping
 * wpos.reg_offset after each one.
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* wpos is a cursor over the components of *reg. */
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      /* When flipping, compute (drawable_height - 1) - pixel_y, plus the
       * half-pixel offset if one applies.
       */
      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      /* Gen6+: copy depth straight from the source depth register. */
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      /* Earlier gens: interpolate component 2 of VARYING_SLOT_POS. */
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
 948
 949 fs_inst *
 950 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 951                          glsl_interp_qualifier interpolation_mode,
 952                          bool is_centroid)
 953 {
 954    brw_wm_barycentric_interp_mode barycoord_mode;
 955    if (intel->gen >= 6) {
 956       if (is_centroid) {
 957          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 958             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
 959          else
 960             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
 961       } else {
 962          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 963             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 964          else
 965             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
 966       }
 967    } else {
 968       /* On Ironlake and below, there is only one interpolation mode.
 969        * Centroid interpolation doesn't mean anything on this hardware --
 970        * there is no multisampling.
 971        */
 972       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 973    }
 974    return emit(FS_OPCODE_LINTERP, attr,
 975                this->delta_x[barycoord_mode],
 976                this->delta_y[barycoord_mode], interp);
 977 }
 978
 979 fs_reg *
 980 fs_visitor::emit_general_interpolation(ir_variable *ir)
 981 {
 982    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 983    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
 984    fs_reg attr = *reg;
 985
 986    unsigned int array_elements;
 987    const glsl_type *type;
 988
 989    if (ir->type->is_array()) {
 990       array_elements = ir->type->length;
 991       if (array_elements == 0) {
 992          fail("dereferenced array '%s' has length 0\n", ir->name);
 993       }
 994       type = ir->type->fields.array;
 995    } else {
 996       array_elements = 1;
 997       type = ir->type;
 998    }
 999
1000    glsl_interp_qualifier interpolation_mode =
1001       ir->determine_interpolation_mode(c->key.flat_shade);
1002
1003    int location = ir->location;
1004    for (unsigned int i = 0; i < array_elements; i++) {
1005       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1006          if (urb_setup[location] == -1) {
1007             /* If there's no incoming setup data for this slot, don't
1008              * emit interpolation for it.
1009              */
1010             attr.reg_offset += type->vector_elements;
1011             location++;
1012             continue;
1013          }
1014
1015          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1016             /* Constant interpolation (flat shading) case. The SF has
1017              * handed us defined values in only the constant offset
1018              * field of the setup reg.
1019              */
1020             for (unsigned int k = 0; k < type->vector_elements; k++) {
1021                struct brw_reg interp = interp_reg(location, k);
1022                interp = suboffset(interp, 3);
1023                interp.type = reg->type;
1024                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1025                attr.reg_offset++;
1026             }
1027          } else {
1028             /* Smooth/noperspective interpolation case. */
1029             for (unsigned int k = 0; k < type->vector_elements; k++) {
1030                /* FINISHME: At some point we probably want to push
1031                 * this farther by giving similar treatment to the
1032                 * other potentially constant components of the
1033                 * attribute, as well as making brw_vs_constval.c
1034                 * handle varyings other than gl_TexCoord.
1035                 */
1036                struct brw_reg interp = interp_reg(location, k);
1037                emit_linterp(attr, fs_reg(interp), interpolation_mode,
1038                             ir->centroid);
1039                if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1040                   /* Get the pixel/sample mask into f0 so that we know
1041                    * which pixels are lit.  Then, for each channel that is
1042                    * unlit, replace the centroid data with non-centroid
1043                    * data.
1044                    */
1045                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1046                   fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1047                                                interpolation_mode, false);
1048                   inst->predicate = BRW_PREDICATE_NORMAL;
1049                   inst->predicate_inverse = true;
1050                }
1051                if (intel->gen < 6) {
1052                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1053                }
1054                attr.reg_offset++;
1055             }
1056
1057          }
1058          location++;
1059       }
1060    }
1061
1062    return reg;
1063 }
1064
1065 fs_reg *
1066 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1067 {
1068    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1069
1070    /* The frontfacing comes in as a bit in the thread payload. */
1071    if (intel->gen >= 6) {
1072       emit(BRW_OPCODE_ASR, *reg,
1073            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1074            fs_reg(15));
1075       emit(BRW_OPCODE_NOT, *reg, *reg);
1076       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1077    } else {
1078       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1079       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1080        * us front face
1081        */
1082       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1083       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1084    }
1085
1086    return reg;
1087 }
1088
1089 fs_reg
1090 fs_visitor::fix_math_operand(fs_reg src)
1091 {
1092    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1093     * might be able to do better by doing execsize = 1 math and then
1094     * expanding that result out, but we would need to be careful with
1095     * masking.
1096     *
1097     * The hardware ignores source modifiers (negate and abs) on math
1098     * instructions, so we also move to a temp to set those up.
1099     */
1100    if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1101        !src.abs && !src.negate)
1102       return src;
1103
1104    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1105     * operands to math
1106     */
1107    if (intel->gen >= 7 && src.file != IMM)
1108       return src;
1109
1110    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1111    expanded.type = src.type;
1112    emit(BRW_OPCODE_MOV, expanded, src);
1113    return expanded;
1114 }
1115
1116 fs_inst *
1117 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1118 {
1119    switch (opcode) {
1120    case SHADER_OPCODE_RCP:
1121    case SHADER_OPCODE_RSQ:
1122    case SHADER_OPCODE_SQRT:
1123    case SHADER_OPCODE_EXP2:
1124    case SHADER_OPCODE_LOG2:
1125    case SHADER_OPCODE_SIN:
1126    case SHADER_OPCODE_COS:
1127       break;
1128    default:
1129       assert(!"not reached: bad math opcode");
1130       return NULL;
1131    }
1132
1133    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1134     * might be able to do better by doing execsize = 1 math and then
1135     * expanding that result out, but we would need to be careful with
1136     * masking.
1137     *
1138     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1139     * instructions, so we also move to a temp to set those up.
1140     */
1141    if (intel->gen >= 6)
1142       src = fix_math_operand(src);
1143
1144    fs_inst *inst = emit(opcode, dst, src);
1145
1146    if (intel->gen < 6) {
1147       inst->base_mrf = 2;
1148       inst->mlen = dispatch_width / 8;
1149    }
1150
1151    return inst;
1152 }
1153
1154 fs_inst *
1155 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1156 {
1157    int base_mrf = 2;
1158    fs_inst *inst;
1159
1160    switch (opcode) {
1161    case SHADER_OPCODE_INT_QUOTIENT:
1162    case SHADER_OPCODE_INT_REMAINDER:
1163       if (intel->gen >= 7 && dispatch_width == 16)
1164          fail("16-wide INTDIV unsupported\n");
1165       break;
1166    case SHADER_OPCODE_POW:
1167       break;
1168    default:
1169       assert(!"not reached: unsupported binary math opcode.");
1170       return NULL;
1171    }
1172
1173    if (intel->gen >= 6) {
1174       src0 = fix_math_operand(src0);
1175       src1 = fix_math_operand(src1);
1176
1177       inst = emit(opcode, dst, src0, src1);
1178    } else {
1179       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1180        * "Message Payload":
1181        *
1182        * "Operand0[7].  For the INT DIV functions, this operand is the
1183        *  denominator."
1184        *  ...
1185        * "Operand1[7].  For the INT DIV functions, this operand is the
1186        *  numerator."
1187        */
1188       bool is_int_div = opcode != SHADER_OPCODE_POW;
1189       fs_reg &op0 = is_int_div ? src1 : src0;
1190       fs_reg &op1 = is_int_div ? src0 : src1;
1191
1192       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1193       inst = emit(opcode, dst, op0, reg_null_f);
1194
1195       inst->base_mrf = base_mrf;
1196       inst->mlen = 2 * dispatch_width / 8;
1197    }
1198    return inst;
1199 }
1200
1201 void
1202 fs_visitor::assign_curb_setup()
1203 {
1204    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1205    if (dispatch_width == 8) {
1206       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1207    } else {
1208       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1209    }
1210
1211    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1212    foreach_list(node, &this->instructions) {
1213       fs_inst *inst = (fs_inst *)node;
1214
1215       for (unsigned int i = 0; i < 3; i++) {
1216          if (inst->src[i].file == UNIFORM) {
1217             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1218             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1219                                                   constant_nr / 8,
1220                                                   constant_nr % 8);
1221
1222             inst->src[i].file = HW_REG;
1223             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1224          }
1225       }
1226    }
1227 }
1228
1229 void
1230 fs_visitor::calculate_urb_setup()
1231 {
1232    for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1233       urb_setup[i] = -1;
1234    }
1235
1236    int urb_next = 0;
1237    /* Figure out where each of the incoming setup attributes lands. */
1238    if (intel->gen >= 6) {
1239       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1240          if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1241             urb_setup[i] = urb_next++;
1242          }
1243       }
1244    } else {
1245       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1246       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1247          /* Point size is packed into the header, not as a general attribute */
1248          if (i == VARYING_SLOT_PSIZ)
1249             continue;
1250
1251          if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1252             /* The back color slot is skipped when the front color is
1253              * also written to.  In addition, some slots can be
1254              * written in the vertex shader and not read in the
1255              * fragment shader.  So the register number must always be
1256              * incremented, mapped or not.
1257              */
1258             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1259                urb_setup[i] = urb_next;
1260             urb_next++;
1261          }
1262       }
1263
1264       /*
1265        * It's a FS only attribute, and we did interpolation for this attribute
1266        * in SF thread. So, count it here, too.
1267        *
1268        * See compile_sf_prog() for more info.
1269        */
1270       if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1271          urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1272    }
1273
1274    /* Each attribute is 4 setup channels, each of which is half a reg. */
1275    c->prog_data.urb_read_length = urb_next * 2;
1276 }
1277
1278 void
1279 fs_visitor::assign_urb_setup()
1280 {
1281    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1282
1283    /* Offset all the urb_setup[] index by the actual position of the
1284     * setup regs, now that the location of the constants has been chosen.
1285     */
1286    foreach_list(node, &this->instructions) {
1287       fs_inst *inst = (fs_inst *)node;
1288
1289       if (inst->opcode == FS_OPCODE_LINTERP) {
1290          assert(inst->src[2].file == HW_REG);
1291          inst->src[2].fixed_hw_reg.nr += urb_start;
1292       }
1293
1294       if (inst->opcode == FS_OPCODE_CINTERP) {
1295          assert(inst->src[0].file == HW_REG);
1296          inst->src[0].fixed_hw_reg.nr += urb_start;
1297       }
1298    }
1299
1300    this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1301 }
1302
1303 /**
1304  * Split large virtual GRFs into separate components if we can.
1305  *
1306  * This is mostly duplicated with what brw_fs_vector_splitting does,
1307  * but that's really conservative because it's afraid of doing
1308  * splitting that doesn't result in real progress after the rest of
1309  * the optimization phases, which would cause infinite looping in
1310  * optimization.  We can do it once here, safely.  This also has the
1311  * opportunity to split interpolated values, or maybe even uniforms,
1312  * which we don't have at the IR level.
1313  *
1314  * We want to split, because virtual GRFs are what we register
1315  * allocate and spill (due to contiguousness requirements for some
1316  * instructions), and they're what we naturally generate in the
1317  * codegen process, but most virtual GRFs don't actually need to be
1318  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1319  * live intervals and better dead code elimination and coalescing.
1320  */
1321 void
1322 fs_visitor::split_virtual_grfs()
1323 {
1324    int num_vars = this->virtual_grf_count;
1325    bool split_grf[num_vars];
1326    int new_virtual_grf[num_vars];
1327
1328    /* Try to split anything > 0 sized. */
1329    for (int i = 0; i < num_vars; i++) {
1330       if (this->virtual_grf_sizes[i] != 1)
1331          split_grf[i] = true;
1332       else
1333          split_grf[i] = false;
1334    }
1335
1336    if (brw->has_pln &&
1337        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1338       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1339        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1340        * Gen6, that was the only supported interpolation mode, and since Gen6,
1341        * delta_x and delta_y are in fixed hardware registers.
1342        */
1343       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1344          false;
1345    }
1346
1347    foreach_list(node, &this->instructions) {
1348       fs_inst *inst = (fs_inst *)node;
1349
1350       /* If there's a SEND message that requires contiguous destination
1351        * registers, no splitting is allowed.
1352        */
1353       if (inst->regs_written > 1) {
1354          split_grf[inst->dst.reg] = false;
1355       }
1356
1357       /* If we're sending from a GRF, don't split it, on the assumption that
1358        * the send is reading the whole thing.
1359        */
1360       if (inst->is_send_from_grf()) {
1361          split_grf[inst->src[0].reg] = false;
1362       }
1363    }
1364
1365    /* Allocate new space for split regs.  Note that the virtual
1366     * numbers will be contiguous.
1367     */
1368    for (int i = 0; i < num_vars; i++) {
1369       if (split_grf[i]) {
1370          new_virtual_grf[i] = virtual_grf_alloc(1);
1371          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1372             int reg = virtual_grf_alloc(1);
1373             assert(reg == new_virtual_grf[i] + j - 1);
1374             (void) reg;
1375          }
1376          this->virtual_grf_sizes[i] = 1;
1377       }
1378    }
1379
1380    foreach_list(node, &this->instructions) {
1381       fs_inst *inst = (fs_inst *)node;
1382
1383       if (inst->dst.file == GRF &&
1384           split_grf[inst->dst.reg] &&
1385           inst->dst.reg_offset != 0) {
1386          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1387                           inst->dst.reg_offset - 1);
1388          inst->dst.reg_offset = 0;
1389       }
1390       for (int i = 0; i < 3; i++) {
1391          if (inst->src[i].file == GRF &&
1392              split_grf[inst->src[i].reg] &&
1393              inst->src[i].reg_offset != 0) {
1394             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1395                                 inst->src[i].reg_offset - 1);
1396             inst->src[i].reg_offset = 0;
1397          }
1398       }
1399    }
1400    this->live_intervals_valid = false;
1401 }
1402
1403 /**
1404  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1405  *
1406  * During code generation, we create tons of temporary variables, many of
1407  * which get immediately killed and are never used again.  Yet, in later
1408  * optimization and analysis passes, such as compute_live_intervals, we need
1409  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1410  * overhead.
1411  */
1412 void
1413 fs_visitor::compact_virtual_grfs()
1414 {
1415    /* Mark which virtual GRFs are used, and count how many. */
1416    int remap_table[this->virtual_grf_count];
1417    memset(remap_table, -1, sizeof(remap_table));
1418
1419    foreach_list(node, &this->instructions) {
1420       const fs_inst *inst = (const fs_inst *) node;
1421
1422       if (inst->dst.file == GRF)
1423          remap_table[inst->dst.reg] = 0;
1424
1425       for (int i = 0; i < 3; i++) {
1426          if (inst->src[i].file == GRF)
1427             remap_table[inst->src[i].reg] = 0;
1428       }
1429    }
1430
1431    /* In addition to registers used in instructions, fs_visitor keeps
1432     * direct references to certain special values which must be patched:
1433     */
1434    fs_reg *special[] = {
1435       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1436       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1437       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1438       &delta_x[0], &delta_x[1], &delta_x[2],
1439       &delta_x[3], &delta_x[4], &delta_x[5],
1440       &delta_y[0], &delta_y[1], &delta_y[2],
1441       &delta_y[3], &delta_y[4], &delta_y[5],
1442    };
1443    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1444    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1445
1446    /* Treat all special values as used, to be conservative */
1447    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1448       if (special[i]->file == GRF)
1449          remap_table[special[i]->reg] = 0;
1450    }
1451
1452    /* Compact the GRF arrays. */
1453    int new_index = 0;
1454    for (int i = 0; i < this->virtual_grf_count; i++) {
1455       if (remap_table[i] != -1) {
1456          remap_table[i] = new_index;
1457          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1458          if (live_intervals_valid) {
1459             virtual_grf_start[new_index] = virtual_grf_start[i];
1460             virtual_grf_end[new_index] = virtual_grf_end[i];
1461          }
1462          ++new_index;
1463       }
1464    }
1465
1466    this->virtual_grf_count = new_index;
1467
1468    /* Patch all the instructions to use the newly renumbered registers */
1469    foreach_list(node, &this->instructions) {
1470       fs_inst *inst = (fs_inst *) node;
1471
1472       if (inst->dst.file == GRF)
1473          inst->dst.reg = remap_table[inst->dst.reg];
1474
1475       for (int i = 0; i < 3; i++) {
1476          if (inst->src[i].file == GRF)
1477             inst->src[i].reg = remap_table[inst->src[i].reg];
1478       }
1479    }
1480
1481    /* Patch all the references to special values */
1482    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1483       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1484          special[i]->reg = remap_table[special[i]->reg];
1485    }
1486 }
1487
1488 bool
1489 fs_visitor::remove_dead_constants()
1490 {
1491    if (dispatch_width == 8) {
1492       this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1493
1494       for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1495          this->params_remap[i] = -1;
1496
1497       /* Find which params are still in use. */
1498       foreach_list(node, &this->instructions) {
1499          fs_inst *inst = (fs_inst *)node;
1500
1501          for (int i = 0; i < 3; i++) {
1502             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1503
1504             if (inst->src[i].file != UNIFORM)
1505                continue;
1506
1507             /* if we get a negative constant nr or one greater than we can
1508              * handle, this can cause an overflow, we can't just refuse to
1509              * build, so just go undefined and alias everyone to constant 0.
1510              */
1511             if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1512                constant_nr = 0;
1513             }
1514
1515             /* For now, set this to non-negative.  We'll give it the
1516              * actual new number in a moment, in order to keep the
1517              * register numbers nicely ordered.
1518              */
1519             this->params_remap[constant_nr] = 0;
1520          }
1521       }
1522
1523       /* Figure out what the new numbers for the params will be.  At some
1524        * point when we're doing uniform array access, we're going to want
1525        * to keep the distinction between .reg and .reg_offset, but for
1526        * now we don't care.
1527        */
1528       unsigned int new_nr_params = 0;
1529       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1530          if (this->params_remap[i] != -1) {
1531             this->params_remap[i] = new_nr_params++;
1532          }
1533       }
1534
1535       /* Update the list of params to be uploaded to match our new numbering. */
1536       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1537          int remapped = this->params_remap[i];
1538
1539          if (remapped == -1)
1540             continue;
1541
1542          c->prog_data.param[remapped] = c->prog_data.param[i];
1543       }
1544
1545       c->prog_data.nr_params = new_nr_params;
1546    } else {
1547       /* This should have been generated in the 8-wide pass already. */
1548       assert(this->params_remap);
1549    }
1550
1551    /* Now do the renumbering of the shader to remove unused params. */
1552    foreach_list(node, &this->instructions) {
1553       fs_inst *inst = (fs_inst *)node;
1554
1555       for (int i = 0; i < 3; i++) {
1556          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1557
1558          if (inst->src[i].file != UNIFORM)
1559             continue;
1560
1561          /* as above alias to 0 */
1562          if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1563             constant_nr = 0;
1564          }
1565          assert(this->params_remap[constant_nr] != -1);
1566          inst->src[i].reg = this->params_remap[constant_nr];
1567          inst->src[i].reg_offset = 0;
1568       }
1569    }
1570
1571    return true;
1572 }
1573
1574 /*
1575  * Implements array access of uniforms by inserting a
1576  * PULL_CONSTANT_LOAD instruction.
1577  *
1578  * Unlike temporary GRF array access (where we don't support it due to
1579  * the difficulty of doing relative addressing on instruction
1580  * destinations), we could potentially do array access of uniforms
1581  * that were loaded in GRF space as push constants.  In real-world
1582  * usage we've seen, though, the arrays being used are always larger
1583  * than we could load as push constants, so just always move all
1584  * uniform array access out to a pull constant buffer.
1585  */
1586 void
1587 fs_visitor::move_uniform_array_access_to_pull_constants()
1588 {
1589    int pull_constant_loc[c->prog_data.nr_params];
1590
1591    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1592       pull_constant_loc[i] = -1;
1593    }
1594
1595    /* Walk through and find array access of uniforms.  Put a copy of that
1596     * uniform in the pull constant buffer.
1597     *
1598     * Note that we don't move constant-indexed accesses to arrays.  No
1599     * testing has been done of the performance impact of this choice.
1600     */
1601    foreach_list_safe(node, &this->instructions) {
1602       fs_inst *inst = (fs_inst *)node;
1603
1604       for (int i = 0 ; i < 3; i++) {
1605          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1606             continue;
1607
1608          int uniform = inst->src[i].reg;
1609
1610          /* If this array isn't already present in the pull constant buffer,
1611           * add it.
1612           */
1613          if (pull_constant_loc[uniform] == -1) {
1614             const float **values = &c->prog_data.param[uniform];
1615
1616             pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1617
1618             assert(param_size[uniform]);
1619
1620             for (int j = 0; j < param_size[uniform]; j++) {
1621                c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1622                   values[j];
1623             }
1624          }
1625
1626          /* Set up the annotation tracking for new generated instructions. */
1627          base_ir = inst->ir;
1628          current_annotation = inst->annotation;
1629
1630          fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1631          fs_reg temp = fs_reg(this, glsl_type::float_type);
1632          exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1633                                                      surf_index,
1634                                                      *inst->src[i].reladdr,
1635                                                      pull_constant_loc[uniform] +
1636                                                      inst->src[i].reg_offset);
1637          inst->insert_before(&list);
1638
1639          inst->src[i].file = temp.file;
1640          inst->src[i].reg = temp.reg;
1641          inst->src[i].reg_offset = temp.reg_offset;
1642          inst->src[i].reladdr = NULL;
1643       }
1644    }
1645 }
1646
1647 /**
1648  * Choose accesses from the UNIFORM file to demote to using the pull
1649  * constant buffer.
1650  *
1651  * We allow a fragment shader to have more than the specified minimum
1652  * maximum number of fragment shader uniform components (64).  If
1653  * there are too many of these, they'd fill up all of register space.
1654  * So, this will push some of them out to the pull constant buffer and
1655  * update the program to load them.
1656  */
1657 void
1658 fs_visitor::setup_pull_constants()
1659 {
1660    /* Only allow 16 registers (128 uniform components) as push constants. */
1661    unsigned int max_uniform_components = 16 * 8;
1662    if (c->prog_data.nr_params <= max_uniform_components)
1663       return;
1664
1665    if (dispatch_width == 16) {
1666       fail("Pull constants not supported in 16-wide\n");
1667       return;
1668    }
1669
1670    /* Just demote the end of the list.  We could probably do better
1671     * here, demoting things that are rarely used in the program first.
1672     */
1673    unsigned int pull_uniform_base = max_uniform_components;
1674
1675    int pull_constant_loc[c->prog_data.nr_params];
1676    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1677       if (i < pull_uniform_base) {
1678          pull_constant_loc[i] = -1;
1679       } else {
1680          pull_constant_loc[i] = -1;
1681          /* If our constant is already being uploaded for reladdr purposes,
1682           * reuse it.
1683           */
1684          for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1685             if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1686                pull_constant_loc[i] = j;
1687                break;
1688             }
1689          }
1690          if (pull_constant_loc[i] == -1) {
1691             int pull_index = c->prog_data.nr_pull_params++;
1692             c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1693             pull_constant_loc[i] = pull_index;;
1694          }
1695       }
1696    }
1697    c->prog_data.nr_params = pull_uniform_base;
1698
1699    foreach_list(node, &this->instructions) {
1700       fs_inst *inst = (fs_inst *)node;
1701
1702       for (int i = 0; i < 3; i++) {
1703          if (inst->src[i].file != UNIFORM)
1704             continue;
1705
1706          int pull_index = pull_constant_loc[inst->src[i].reg +
1707                                             inst->src[i].reg_offset];
1708          if (pull_index == -1)
1709             continue;
1710
1711          assert(!inst->src[i].reladdr);
1712
1713          fs_reg dst = fs_reg(this, glsl_type::float_type);
1714          fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1715          fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1716          fs_inst *pull =
1717             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1718                                  dst, index, offset);
1719          pull->ir = inst->ir;
1720          pull->annotation = inst->annotation;
1721
1722          inst->insert_before(pull);
1723
1724          inst->src[i].file = GRF;
1725          inst->src[i].reg = dst.reg;
1726          inst->src[i].reg_offset = 0;
1727          inst->src[i].smear = pull_index & 3;
1728       }
1729    }
1730 }
1731
1732 bool
1733 fs_visitor::opt_algebraic()
1734 {
1735    bool progress = false;
1736
1737    foreach_list(node, &this->instructions) {
1738       fs_inst *inst = (fs_inst *)node;
1739
1740       switch (inst->opcode) {
1741       case BRW_OPCODE_MUL:
1742          if (inst->src[1].file != IMM)
1743             continue;
1744
1745          /* a * 1.0 = a */
1746          if (inst->src[1].is_one()) {
1747             inst->opcode = BRW_OPCODE_MOV;
1748             inst->src[1] = reg_undef;
1749             progress = true;
1750             break;
1751          }
1752
1753          /* a * 0.0 = 0.0 */
1754          if (inst->src[1].is_zero()) {
1755             inst->opcode = BRW_OPCODE_MOV;
1756             inst->src[0] = inst->src[1];
1757             inst->src[1] = reg_undef;
1758             progress = true;
1759             break;
1760          }
1761
1762          break;
1763       case BRW_OPCODE_ADD:
1764          if (inst->src[1].file != IMM)
1765             continue;
1766
1767          /* a + 0.0 = a */
1768          if (inst->src[1].is_zero()) {
1769             inst->opcode = BRW_OPCODE_MOV;
1770             inst->src[1] = reg_undef;
1771             progress = true;
1772             break;
1773          }
1774          break;
1775       default:
1776          break;
1777       }
1778    }
1779
1780    return progress;
1781 }
1782
1783 /**
1784  * Removes any instructions writing a VGRF where that VGRF is not used by any
1785  * later instruction.
1786  */
1787 bool
1788 fs_visitor::dead_code_eliminate()
1789 {
1790    bool progress = false;
1791    int pc = 0;
1792
1793    calculate_live_intervals();
1794
1795    foreach_list_safe(node, &this->instructions) {
1796       fs_inst *inst = (fs_inst *)node;
1797
1798       if (inst->dst.file == GRF) {
1799          assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1800          if (this->virtual_grf_end[inst->dst.reg] == pc) {
1801             inst->remove();
1802             progress = true;
1803          }
1804       }
1805
1806       pc++;
1807    }
1808
1809    if (progress)
1810       live_intervals_valid = false;
1811
1812    return progress;
1813 }
1814
/* Key of the local dead code elimination hash table: one register of a
 * virtual GRF.  Keys are hashed and compared as raw bytes, which works
 * because the two int members leave no padding between them.
 */
struct dead_code_hash_key
{
   /* Virtual GRF number. */
   int vgrf;
   /* Register offset within that virtual GRF. */
   int reg_offset;
};
1820
1821 static bool
1822 dead_code_hash_compare(const void *a, const void *b)
1823 {
1824    return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1825 }
1826
1827 static void
1828 clear_dead_code_hash(struct hash_table *ht)
1829 {
1830    struct hash_entry *entry;
1831
1832    hash_table_foreach(ht, entry) {
1833       _mesa_hash_table_remove(ht, entry);
1834    }
1835 }
1836
1837 static void
1838 insert_dead_code_hash(struct hash_table *ht,
1839                       int vgrf, int reg_offset, fs_inst *inst)
1840 {
1841    /* We don't bother freeing keys, because they'll be GCed with the ht. */
1842    struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1843
1844    key->vgrf = vgrf;
1845    key->reg_offset = reg_offset;
1846
1847    _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1848 }
1849
1850 static struct hash_entry *
1851 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1852 {
1853    struct dead_code_hash_key key;
1854
1855    key.vgrf = vgrf;
1856    key.reg_offset = reg_offset;
1857
1858    return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1859 }
1860
1861 static void
1862 remove_dead_code_hash(struct hash_table *ht,
1863                       int vgrf, int reg_offset)
1864 {
1865    struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1866    if (!entry)
1867       return;
1868
1869    _mesa_hash_table_remove(ht, entry);
1870 }
1871
1872 /**
1873  * Walks basic blocks, removing any regs that are written but not read before
1874  * being redefined.
1875  *
1876  * The dead_code_eliminate() function implements a global dead code
1877  * elimination, but it only handles the removing the last write to a register
1878  * if it's never read.  This one can handle intermediate writes, but only
1879  * within a basic block.
1880  */
1881 bool
1882 fs_visitor::dead_code_eliminate_local()
1883 {
1884    struct hash_table *ht;
1885    bool progress = false;
1886
1887    ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1888
1889    foreach_list_safe(node, &this->instructions) {
1890       fs_inst *inst = (fs_inst *)node;
1891
1892       /* At a basic block, empty the HT since we don't understand dataflow
1893        * here.
1894        */
1895       if (inst->is_control_flow()) {
1896          clear_dead_code_hash(ht);
1897          continue;
1898       }
1899
1900       /* Clear the HT of any instructions that got read. */
1901       for (int i = 0; i < 3; i++) {
1902          fs_reg src = inst->src[i];
1903          if (src.file != GRF)
1904             continue;
1905
1906          int read = 1;
1907          if (inst->is_send_from_grf())
1908             read = virtual_grf_sizes[src.reg] - src.reg_offset;
1909
1910          for (int reg_offset = src.reg_offset;
1911               reg_offset < src.reg_offset + read;
1912               reg_offset++) {
1913             remove_dead_code_hash(ht, src.reg, reg_offset);
1914          }
1915       }
1916
1917       /* Add any update of a GRF to the HT, removing a previous write if it
1918        * wasn't read.
1919        */
1920       if (inst->dst.file == GRF) {
1921          if (inst->regs_written > 1) {
1922             /* We don't know how to trim channels from an instruction's
1923              * writes, so we can't incrementally remove unread channels from
1924              * it.  Just remove whatever it overwrites from the table
1925              */
1926             for (int i = 0; i < inst->regs_written; i++) {
1927                remove_dead_code_hash(ht,
1928                                      inst->dst.reg,
1929                                      inst->dst.reg_offset + i);
1930             }
1931          } else {
1932             struct hash_entry *entry =
1933                get_dead_code_hash_entry(ht, inst->dst.reg,
1934                                         inst->dst.reg_offset);
1935
1936             if (inst->is_partial_write()) {
1937                /* For a partial write, we can't remove any previous dead code
1938                 * candidate, since we're just modifying their result, but we can
1939                 * be dead code eliminiated ourselves.
1940                 */
1941                if (entry) {
1942                   entry->data = inst;
1943                } else {
1944                   insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1945                                         inst);
1946                }
1947             } else {
1948                if (entry) {
1949                   /* We're completely updating a channel, and there was a
1950                    * previous write to the channel that wasn't read.  Kill it!
1951                    */
1952                   fs_inst *inst = (fs_inst *)entry->data;
1953                   inst->remove();
1954                   progress = true;
1955                   _mesa_hash_table_remove(ht, entry);
1956                }
1957
1958                insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1959                                      inst);
1960             }
1961          }
1962       }
1963    }
1964
1965    _mesa_hash_table_destroy(ht, NULL);
1966
1967    if (progress)
1968       live_intervals_valid = false;
1969
1970    return progress;
1971 }
1972
1973 /**
1974  * Implements a second type of register coalescing: This one checks if
1975  * the two regs involved in a raw move don't interfere, in which case
1976  * they can both by stored in the same place and the MOV removed.
1977  */
1978 bool
1979 fs_visitor::register_coalesce_2()
1980 {
1981    bool progress = false;
1982
1983    calculate_live_intervals();
1984
1985    foreach_list_safe(node, &this->instructions) {
1986       fs_inst *inst = (fs_inst *)node;
1987
1988       if (inst->opcode != BRW_OPCODE_MOV ||
1989           inst->is_partial_write() ||
1990           inst->saturate ||
1991           inst->src[0].file != GRF ||
1992           inst->src[0].negate ||
1993           inst->src[0].abs ||
1994           inst->src[0].smear != -1 ||
1995           inst->dst.file != GRF ||
1996           inst->dst.type != inst->src[0].type ||
1997           virtual_grf_sizes[inst->src[0].reg] != 1 ||
1998           virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1999          continue;
2000       }
2001
2002       int reg_from = inst->src[0].reg;
2003       assert(inst->src[0].reg_offset == 0);
2004       int reg_to = inst->dst.reg;
2005       int reg_to_offset = inst->dst.reg_offset;
2006
2007       foreach_list(node, &this->instructions) {
2008          fs_inst *scan_inst = (fs_inst *)node;
2009
2010          if (scan_inst->dst.file == GRF &&
2011              scan_inst->dst.reg == reg_from) {
2012             scan_inst->dst.reg = reg_to;
2013             scan_inst->dst.reg_offset = reg_to_offset;
2014          }
2015          for (int i = 0; i < 3; i++) {
2016             if (scan_inst->src[i].file == GRF &&
2017                 scan_inst->src[i].reg == reg_from) {
2018                scan_inst->src[i].reg = reg_to;
2019                scan_inst->src[i].reg_offset = reg_to_offset;
2020             }
2021          }
2022       }
2023
2024       inst->remove();
2025
2026       /* We don't need to recalculate live intervals inside the loop despite
2027        * flagging live_intervals_valid because we only use live intervals for
2028        * the interferes test, and we must have had a situation where the
2029        * intervals were:
2030        *
2031        *  from  to
2032        *  ^
2033        *  |
2034        *  v
2035        *        ^
2036        *        |
2037        *        v
2038        *
2039        * Some register R that might get coalesced with one of these two could
2040        * only be referencing "to", otherwise "from"'s range would have been
2041        * longer.  R's range could also only start at the end of "to" or later,
2042        * otherwise it will conflict with "to" when we try to coalesce "to"
2043        * into Rw anyway.
2044        */
2045       live_intervals_valid = false;
2046
2047       progress = true;
2048       continue;
2049    }
2050
2051    return progress;
2052 }
2053
2054 bool
2055 fs_visitor::register_coalesce()
2056 {
2057    bool progress = false;
2058    int if_depth = 0;
2059    int loop_depth = 0;
2060
2061    foreach_list_safe(node, &this->instructions) {
2062       fs_inst *inst = (fs_inst *)node;
2063
2064       /* Make sure that we dominate the instructions we're going to
2065        * scan for interfering with our coalescing, or we won't have
2066        * scanned enough to see if anything interferes with our
2067        * coalescing.  We don't dominate the following instructions if
2068        * we're in a loop or an if block.
2069        */
2070       switch (inst->opcode) {
2071       case BRW_OPCODE_DO:
2072          loop_depth++;
2073          break;
2074       case BRW_OPCODE_WHILE:
2075          loop_depth--;
2076          break;
2077       case BRW_OPCODE_IF:
2078          if_depth++;
2079          break;
2080       case BRW_OPCODE_ENDIF:
2081          if_depth--;
2082          break;
2083       default:
2084          break;
2085       }
2086       if (loop_depth || if_depth)
2087          continue;
2088
2089       if (inst->opcode != BRW_OPCODE_MOV ||
2090           inst->is_partial_write() ||
2091           inst->saturate ||
2092           inst->dst.file != GRF || (inst->src[0].file != GRF &&
2093                                     inst->src[0].file != UNIFORM)||
2094           inst->dst.type != inst->src[0].type)
2095          continue;
2096
2097       bool has_source_modifiers = (inst->src[0].abs ||
2098                                    inst->src[0].negate ||
2099                                    inst->src[0].smear != -1 ||
2100                                    inst->src[0].file == UNIFORM);
2101
2102       /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
2103        * them: check for no writes to either one until the exit of the
2104        * program.
2105        */
2106       bool interfered = false;
2107
2108       for (fs_inst *scan_inst = (fs_inst *)inst->next;
2109            !scan_inst->is_tail_sentinel();
2110            scan_inst = (fs_inst *)scan_inst->next) {
2111          if (scan_inst->dst.file == GRF) {
2112             if (scan_inst->overwrites_reg(inst->dst) ||
2113                 scan_inst->overwrites_reg(inst->src[0])) {
2114                interfered = true;
2115                break;
2116             }
2117          }
2118
2119          /* The gen6 MATH instruction can't handle source modifiers or
2120           * unusual register regions, so avoid coalescing those for
2121           * now.  We should do something more specific.
2122           */
2123          if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2124             interfered = true;
2125             break;
2126          }
2127
2128          /* The accumulator result appears to get used for the
2129           * conditional modifier generation.  When negating a UD
2130           * value, there is a 33rd bit generated for the sign in the
2131           * accumulator value, so now you can't check, for example,
2132           * equality with a 32-bit value.  See piglit fs-op-neg-uint.
2133           */
2134          if (scan_inst->conditional_mod &&
2135              inst->src[0].negate &&
2136              inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2137             interfered = true;
2138             break;
2139          }
2140       }
2141       if (interfered) {
2142          continue;
2143       }
2144
2145       /* Rewrite the later usage to point at the source of the move to
2146        * be removed.
2147        */
2148       for (fs_inst *scan_inst = inst;
2149            !scan_inst->is_tail_sentinel();
2150            scan_inst = (fs_inst *)scan_inst->next) {
2151          for (int i = 0; i < 3; i++) {
2152             if (scan_inst->src[i].file == GRF &&
2153                 scan_inst->src[i].reg == inst->dst.reg &&
2154                 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2155                fs_reg new_src = inst->src[0];
2156                if (scan_inst->src[i].abs) {
2157                   new_src.negate = 0;
2158                   new_src.abs = 1;
2159                }
2160                new_src.negate ^= scan_inst->src[i].negate;
2161                scan_inst->src[i] = new_src;
2162             }
2163          }
2164       }
2165
2166       inst->remove();
2167       progress = true;
2168    }
2169
2170    if (progress)
2171       live_intervals_valid = false;
2172
2173    return progress;
2174 }
2175
2176
2177 bool
2178 fs_visitor::compute_to_mrf()
2179 {
2180    bool progress = false;
2181    int next_ip = 0;
2182
2183    calculate_live_intervals();
2184
2185    foreach_list_safe(node, &this->instructions) {
2186       fs_inst *inst = (fs_inst *)node;
2187
2188       int ip = next_ip;
2189       next_ip++;
2190
2191       if (inst->opcode != BRW_OPCODE_MOV ||
2192           inst->is_partial_write() ||
2193           inst->dst.file != MRF || inst->src[0].file != GRF ||
2194           inst->dst.type != inst->src[0].type ||
2195           inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2196          continue;
2197
2198       /* Work out which hardware MRF registers are written by this
2199        * instruction.
2200        */
2201       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2202       int mrf_high;
2203       if (inst->dst.reg & BRW_MRF_COMPR4) {
2204          mrf_high = mrf_low + 4;
2205       } else if (dispatch_width == 16 &&
2206                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2207          mrf_high = mrf_low + 1;
2208       } else {
2209          mrf_high = mrf_low;
2210       }
2211
2212       /* Can't compute-to-MRF this GRF if someone else was going to
2213        * read it later.
2214        */
2215       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2216          continue;
2217
2218       /* Found a move of a GRF to a MRF.  Let's see if we can go
2219        * rewrite the thing that made this GRF to write into the MRF.
2220        */
2221       fs_inst *scan_inst;
2222       for (scan_inst = (fs_inst *)inst->prev;
2223            scan_inst->prev != NULL;
2224            scan_inst = (fs_inst *)scan_inst->prev) {
2225          if (scan_inst->dst.file == GRF &&
2226              scan_inst->dst.reg == inst->src[0].reg) {
2227             /* Found the last thing to write our reg we want to turn
2228              * into a compute-to-MRF.
2229              */
2230
2231             /* If this one instruction didn't populate all the
2232              * channels, bail.  We might be able to rewrite everything
2233              * that writes that reg, but it would require smarter
2234              * tracking to delay the rewriting until complete success.
2235              */
2236             if (scan_inst->is_partial_write())
2237                break;
2238
2239             /* Things returning more than one register would need us to
2240              * understand coalescing out more than one MOV at a time.
2241              */
2242             if (scan_inst->regs_written > 1)
2243                break;
2244
2245             /* SEND instructions can't have MRF as a destination. */
2246             if (scan_inst->mlen)
2247                break;
2248
2249             if (intel->gen == 6) {
2250                /* gen6 math instructions must have the destination be
2251                 * GRF, so no compute-to-MRF for them.
2252                 */
2253                if (scan_inst->is_math()) {
2254                   break;
2255                }
2256             }
2257
2258             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2259                /* Found the creator of our MRF's source value. */
2260                scan_inst->dst.file = MRF;
2261                scan_inst->dst.reg = inst->dst.reg;
2262                scan_inst->saturate |= inst->saturate;
2263                inst->remove();
2264                progress = true;
2265             }
2266             break;
2267          }
2268
2269          /* We don't handle control flow here.  Most computation of
2270           * values that end up in MRFs are shortly before the MRF
2271           * write anyway.
2272           */
2273          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2274             break;
2275
2276          /* You can't read from an MRF, so if someone else reads our
2277           * MRF's source GRF that we wanted to rewrite, that stops us.
2278           */
2279          bool interfered = false;
2280          for (int i = 0; i < 3; i++) {
2281             if (scan_inst->src[i].file == GRF &&
2282                 scan_inst->src[i].reg == inst->src[0].reg &&
2283                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2284                interfered = true;
2285             }
2286          }
2287          if (interfered)
2288             break;
2289
2290          if (scan_inst->dst.file == MRF) {
2291             /* If somebody else writes our MRF here, we can't
2292              * compute-to-MRF before that.
2293              */
2294             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2295             int scan_mrf_high;
2296
2297             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2298                scan_mrf_high = scan_mrf_low + 4;
2299             } else if (dispatch_width == 16 &&
2300                        (!scan_inst->force_uncompressed &&
2301                         !scan_inst->force_sechalf)) {
2302                scan_mrf_high = scan_mrf_low + 1;
2303             } else {
2304                scan_mrf_high = scan_mrf_low;
2305             }
2306
2307             if (mrf_low == scan_mrf_low ||
2308                 mrf_low == scan_mrf_high ||
2309                 mrf_high == scan_mrf_low ||
2310                 mrf_high == scan_mrf_high) {
2311                break;
2312             }
2313          }
2314
2315          if (scan_inst->mlen > 0) {
2316             /* Found a SEND instruction, which means that there are
2317              * live values in MRFs from base_mrf to base_mrf +
2318              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2319              * above it.
2320              */
2321             if (mrf_low >= scan_inst->base_mrf &&
2322                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2323                break;
2324             }
2325             if (mrf_high >= scan_inst->base_mrf &&
2326                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2327                break;
2328             }
2329          }
2330       }
2331    }
2332
2333    if (progress)
2334       live_intervals_valid = false;
2335
2336    return progress;
2337 }
2338
2339 /**
2340  * Walks through basic blocks, looking for repeated MRF writes and
2341  * removing the later ones.
2342  */
2343 bool
2344 fs_visitor::remove_duplicate_mrf_writes()
2345 {
2346    fs_inst *last_mrf_move[16];
2347    bool progress = false;
2348
2349    /* Need to update the MRF tracking for compressed instructions. */
2350    if (dispatch_width == 16)
2351       return false;
2352
2353    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2354
2355    foreach_list_safe(node, &this->instructions) {
2356       fs_inst *inst = (fs_inst *)node;
2357
2358       if (inst->is_control_flow()) {
2359          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2360       }
2361
2362       if (inst->opcode == BRW_OPCODE_MOV &&
2363           inst->dst.file == MRF) {
2364          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2365          if (prev_inst && inst->equals(prev_inst)) {
2366             inst->remove();
2367             progress = true;
2368             continue;
2369          }
2370       }
2371
2372       /* Clear out the last-write records for MRFs that were overwritten. */
2373       if (inst->dst.file == MRF) {
2374          last_mrf_move[inst->dst.reg] = NULL;
2375       }
2376
2377       if (inst->mlen > 0) {
2378          /* Found a SEND instruction, which will include two or fewer
2379           * implied MRF writes.  We could do better here.
2380           */
2381          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2382             last_mrf_move[inst->base_mrf + i] = NULL;
2383          }
2384       }
2385
2386       /* Clear out any MRF move records whose sources got overwritten. */
2387       if (inst->dst.file == GRF) {
2388          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2389             if (last_mrf_move[i] &&
2390                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2391                last_mrf_move[i] = NULL;
2392             }
2393          }
2394       }
2395
2396       if (inst->opcode == BRW_OPCODE_MOV &&
2397           inst->dst.file == MRF &&
2398           inst->src[0].file == GRF &&
2399           !inst->is_partial_write()) {
2400          last_mrf_move[inst->dst.reg] = inst;
2401       }
2402    }
2403
2404    if (progress)
2405       live_intervals_valid = false;
2406
2407    return progress;
2408 }
2409
2410 static void
2411 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2412                         int first_grf, int grf_len)
2413 {
2414    bool inst_16wide = (dispatch_width > 8 &&
2415                        !inst->force_uncompressed &&
2416                        !inst->force_sechalf);
2417
2418    /* Clear the flag for registers that actually got read (as expected). */
2419    for (int i = 0; i < 3; i++) {
2420       int grf;
2421       if (inst->src[i].file == GRF) {
2422          grf = inst->src[i].reg;
2423       } else if (inst->src[i].file == HW_REG &&
2424                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2425          grf = inst->src[i].fixed_hw_reg.nr;
2426       } else {
2427          continue;
2428       }
2429
2430       if (grf >= first_grf &&
2431           grf < first_grf + grf_len) {
2432          deps[grf - first_grf] = false;
2433          if (inst_16wide)
2434             deps[grf - first_grf + 1] = false;
2435       }
2436    }
2437 }
2438
2439 /**
2440  * Implements this workaround for the original 965:
2441  *
2442  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2443  *      check for post destination dependencies on this instruction, software
2444  *      must ensure that there is no destination hazard for the case of ‘write
2445  *      followed by a posted write’ shown in the following example.
2446  *
2447  *      1. mov r3 0
2448  *      2. send r3.xy <rest of send instruction>
2449  *      3. mov r2 r3
2450  *
2451  *      Due to no post-destination dependency check on the ‘send’, the above
2452  *      code sequence could have two instructions (1 and 2) in flight at the
2453  *      same time that both consider ‘r3’ as the target of their final writes.
2454  */
2455 void
2456 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2457 {
2458    int reg_size = dispatch_width / 8;
2459    int write_len = inst->regs_written * reg_size;
2460    int first_write_grf = inst->dst.reg;
2461    bool needs_dep[BRW_MAX_MRF];
2462    assert(write_len < (int)sizeof(needs_dep) - 1);
2463
2464    memset(needs_dep, false, sizeof(needs_dep));
2465    memset(needs_dep, true, write_len);
2466
2467    clear_deps_for_inst_src(inst, dispatch_width,
2468                            needs_dep, first_write_grf, write_len);
2469
2470    /* Walk backwards looking for writes to registers we're writing which
2471     * aren't read since being written.  If we hit the start of the program,
2472     * we assume that there are no outstanding dependencies on entry to the
2473     * program.
2474     */
2475    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2476         scan_inst != NULL;
2477         scan_inst = (fs_inst *)scan_inst->prev) {
2478
2479       /* If we hit control flow, assume that there *are* outstanding
2480        * dependencies, and force their cleanup before our instruction.
2481        */
2482       if (scan_inst->is_control_flow()) {
2483          for (int i = 0; i < write_len; i++) {
2484             if (needs_dep[i]) {
2485                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2486             }
2487          }
2488          return;
2489       }
2490
2491       bool scan_inst_16wide = (dispatch_width > 8 &&
2492                                !scan_inst->force_uncompressed &&
2493                                !scan_inst->force_sechalf);
2494
2495       /* We insert our reads as late as possible on the assumption that any
2496        * instruction but a MOV that might have left us an outstanding
2497        * dependency has more latency than a MOV.
2498        */
2499       if (scan_inst->dst.file == GRF) {
2500          for (int i = 0; i < scan_inst->regs_written; i++) {
2501             int reg = scan_inst->dst.reg + i * reg_size;
2502
2503             if (reg >= first_write_grf &&
2504                 reg < first_write_grf + write_len &&
2505                 needs_dep[reg - first_write_grf]) {
2506                inst->insert_before(DEP_RESOLVE_MOV(reg));
2507                needs_dep[reg - first_write_grf] = false;
2508                if (scan_inst_16wide)
2509                   needs_dep[reg - first_write_grf + 1] = false;
2510             }
2511          }
2512       }
2513
2514       /* Clear the flag for registers that actually got read (as expected). */
2515       clear_deps_for_inst_src(scan_inst, dispatch_width,
2516                               needs_dep, first_write_grf, write_len);
2517
2518       /* Continue the loop only if we haven't resolved all the dependencies */
2519       int i;
2520       for (i = 0; i < write_len; i++) {
2521          if (needs_dep[i])
2522             break;
2523       }
2524       if (i == write_len)
2525          return;
2526    }
2527 }
2528
2529 /**
2530  * Implements this workaround for the original 965:
2531  *
2532  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2533  *      used as a destination register until after it has been sourced by an
2534  *      instruction with a different destination register.
2535  */
2536 void
2537 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2538 {
2539    int write_len = inst->regs_written * dispatch_width / 8;
2540    int first_write_grf = inst->dst.reg;
2541    bool needs_dep[BRW_MAX_MRF];
2542    assert(write_len < (int)sizeof(needs_dep) - 1);
2543
2544    memset(needs_dep, false, sizeof(needs_dep));
2545    memset(needs_dep, true, write_len);
2546    /* Walk forwards looking for writes to registers we're writing which aren't
2547     * read before being written.
2548     */
2549    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2550         !scan_inst->is_tail_sentinel();
2551         scan_inst = (fs_inst *)scan_inst->next) {
2552       /* If we hit control flow, force resolve all remaining dependencies. */
2553       if (scan_inst->is_control_flow()) {
2554          for (int i = 0; i < write_len; i++) {
2555             if (needs_dep[i])
2556                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2557          }
2558          return;
2559       }
2560
2561       /* Clear the flag for registers that actually got read (as expected). */
2562       clear_deps_for_inst_src(scan_inst, dispatch_width,
2563                               needs_dep, first_write_grf, write_len);
2564
2565       /* We insert our reads as late as possible since they're reading the
2566        * result of a SEND, which has massive latency.
2567        */
2568       if (scan_inst->dst.file == GRF &&
2569           scan_inst->dst.reg >= first_write_grf &&
2570           scan_inst->dst.reg < first_write_grf + write_len &&
2571           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2572          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2573          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2574       }
2575
2576       /* Continue the loop only if we haven't resolved all the dependencies */
2577       int i;
2578       for (i = 0; i < write_len; i++) {
2579          if (needs_dep[i])
2580             break;
2581       }
2582       if (i == write_len)
2583          return;
2584    }
2585
2586    /* If we hit the end of the program, resolve all remaining dependencies out
2587     * of paranoia.
2588     */
2589    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2590    assert(last_inst->eot);
2591    for (int i = 0; i < write_len; i++) {
2592       if (needs_dep[i])
2593          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2594    }
2595 }
2596
2597 void
2598 fs_visitor::insert_gen4_send_dependency_workarounds()
2599 {
2600    if (intel->gen != 4 || intel->is_g4x)
2601       return;
2602
2603    /* Note that we're done with register allocation, so GRF fs_regs always
2604     * have a .reg_offset of 0.
2605     */
2606
2607    foreach_list_safe(node, &this->instructions) {
2608       fs_inst *inst = (fs_inst *)node;
2609
2610       if (inst->mlen != 0 && inst->dst.file == GRF) {
2611          insert_gen4_pre_send_dependency_workarounds(inst);
2612          insert_gen4_post_send_dependency_workarounds(inst);
2613       }
2614    }
2615 }
2616
2617 /**
2618  * Turns the generic expression-style uniform pull constant load instruction
2619  * into a hardware-specific series of instructions for loading a pull
2620  * constant.
2621  *
2622  * The expression style allows the CSE pass before this to optimize out
2623  * repeated loads from the same offset, and gives the pre-register-allocation
2624  * scheduling full flexibility, while the conversion to native instructions
2625  * allows the post-register-allocation scheduler the best information
2626  * possible.
2627  *
2628  * Note that execution masking for setting up pull constant loads is special:
2629  * the channels that need to be written are unrelated to the current execution
2630  * mask, since a later instruction will use one of the result channels as a
2631  * source operand for all 8 or 16 of its channels.
2632  */
2633 void
2634 fs_visitor::lower_uniform_pull_constant_loads()
2635 {
2636    foreach_list(node, &this->instructions) {
2637       fs_inst *inst = (fs_inst *)node;
2638
2639       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2640          continue;
2641
2642       if (intel->gen >= 7) {
2643          /* The offset arg before was a vec4-aligned byte offset.  We need to
2644           * turn it into a dword offset.
2645           */
2646          fs_reg const_offset_reg = inst->src[1];
2647          assert(const_offset_reg.file == IMM &&
2648                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2649          const_offset_reg.imm.u /= 4;
2650          fs_reg payload = fs_reg(this, glsl_type::uint_type);
2651
2652          /* This is actually going to be a MOV, but since only the first dword
2653           * is accessed, we have a special opcode to do just that one.  Note
2654           * that this needs to be an operation that will be considered a def
2655           * by live variable analysis, or register allocation will explode.
2656           */
2657          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2658                                                payload, const_offset_reg);
2659          setup->force_writemask_all = true;
2660
2661          setup->ir = inst->ir;
2662          setup->annotation = inst->annotation;
2663          inst->insert_before(setup);
2664
2665          /* Similarly, this will only populate the first 4 channels of the
2666           * result register (since we only use smear values from 0-3), but we
2667           * don't tell the optimizer.
2668           */
2669          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2670          inst->src[1] = payload;
2671
2672          this->live_intervals_valid = false;
2673       } else {
2674          /* Before register allocation, we didn't tell the scheduler about the
2675           * MRF we use.  We know it's safe to use this MRF because nothing
2676           * else does except for register spill/unspill, which generates and
2677           * uses its MRF within a single IR instruction.
2678           */
2679          inst->base_mrf = 14;
2680          inst->mlen = 1;
2681       }
2682    }
2683 }
2684
2685 void
2686 fs_visitor::dump_instruction(backend_instruction *be_inst)
2687 {
2688    fs_inst *inst = (fs_inst *)be_inst;
2689
2690    if (inst->predicate) {
2691       printf("(%cf0.%d) ",
2692              inst->predicate_inverse ? '-' : '+',
2693              inst->flag_subreg);
2694    }
2695
2696    printf("%s", brw_instruction_name(inst->opcode));
2697    if (inst->saturate)
2698       printf(".sat");
2699    if (inst->conditional_mod) {
2700       printf(".cmod");
2701       if (!inst->predicate &&
2702           (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2703                               inst->opcode != BRW_OPCODE_IF &&
2704                               inst->opcode != BRW_OPCODE_WHILE))) {
2705          printf(".f0.%d\n", inst->flag_subreg);
2706       }
2707    }
2708    printf(" ");
2709
2710
2711    switch (inst->dst.file) {
2712    case GRF:
2713       printf("vgrf%d", inst->dst.reg);
2714       if (inst->dst.reg_offset)
2715          printf("+%d", inst->dst.reg_offset);
2716       break;
2717    case MRF:
2718       printf("m%d", inst->dst.reg);
2719       break;
2720    case BAD_FILE:
2721       printf("(null)");
2722       break;
2723    case UNIFORM:
2724       printf("***u%d***", inst->dst.reg);
2725       break;
2726    default:
2727       printf("???");
2728       break;
2729    }
2730    printf(", ");
2731
2732    for (int i = 0; i < 3; i++) {
2733       if (inst->src[i].negate)
2734          printf("-");
2735       if (inst->src[i].abs)
2736          printf("|");
2737       switch (inst->src[i].file) {
2738       case GRF:
2739          printf("vgrf%d", inst->src[i].reg);
2740          if (inst->src[i].reg_offset)
2741             printf("+%d", inst->src[i].reg_offset);
2742          break;
2743       case MRF:
2744          printf("***m%d***", inst->src[i].reg);
2745          break;
2746       case UNIFORM:
2747          printf("u%d", inst->src[i].reg);
2748          if (inst->src[i].reg_offset)
2749             printf(".%d", inst->src[i].reg_offset);
2750          break;
2751       case BAD_FILE:
2752          printf("(null)");
2753          break;
2754       case IMM:
2755          switch (inst->src[i].type) {
2756          case BRW_REGISTER_TYPE_F:
2757             printf("%ff", inst->src[i].imm.f);
2758             break;
2759          case BRW_REGISTER_TYPE_D:
2760             printf("%dd", inst->src[i].imm.i);
2761             break;
2762          case BRW_REGISTER_TYPE_UD:
2763             printf("%uu", inst->src[i].imm.u);
2764             break;
2765          default:
2766             printf("???");
2767             break;
2768          }
2769          break;
2770       default:
2771          printf("???");
2772          break;
2773       }
2774       if (inst->src[i].abs)
2775          printf("|");
2776
2777       if (i < 3)
2778          printf(", ");
2779    }
2780
2781    printf(" ");
2782
2783    if (inst->force_uncompressed)
2784       printf("1sthalf ");
2785
2786    if (inst->force_sechalf)
2787       printf("2ndhalf ");
2788
2789    printf("\n");
2790 }
2791
2792 /**
2793  * Possibly returns an instruction that set up @param reg.
2794  *
2795  * Sometimes we want to take the result of some expression/variable
2796  * dereference tree and rewrite the instruction generating the result
2797  * of the tree.  When processing the tree, we know that the
2798  * instructions generated are all writing temporaries that are dead
2799  * outside of this tree.  So, if we have some instructions that write
2800  * a temporary, we're free to point that temp write somewhere else.
2801  *
2802  * Note that this doesn't guarantee that the instruction generated
2803  * only reg -- it might be the size=4 destination of a texture instruction.
2804  */
2805 fs_inst *
2806 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2807                                            fs_inst *end,
2808                                            fs_reg reg)
2809 {
2810    if (end == start ||
2811        end->is_partial_write() ||
2812        reg.reladdr ||
2813        !reg.equals(end->dst)) {
2814       return NULL;
2815    } else {
2816       return end;
2817    }
2818 }
2819
2820 void
2821 fs_visitor::setup_payload_gen6()
2822 {
2823    bool uses_depth =
2824       (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2825    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2826
2827    assert(intel->gen >= 6);
2828
2829    /* R0-1: masks, pixel X/Y coordinates. */
2830    c->nr_payload_regs = 2;
2831    /* R2: only for 32-pixel dispatch.*/
2832
2833    /* R3-26: barycentric interpolation coordinates.  These appear in the
2834     * same order that they appear in the brw_wm_barycentric_interp_mode
2835     * enum.  Each set of coordinates occupies 2 registers if dispatch width
2836     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2837     * appear if they were enabled using the "Barycentric Interpolation
2838     * Mode" bits in WM_STATE.
2839     */
2840    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2841       if (barycentric_interp_modes & (1 << i)) {
2842          c->barycentric_coord_reg[i] = c->nr_payload_regs;
2843          c->nr_payload_regs += 2;
2844          if (dispatch_width == 16) {
2845             c->nr_payload_regs += 2;
2846          }
2847       }
2848    }
2849
2850    /* R27: interpolated depth if uses source depth */
2851    if (uses_depth) {
2852       c->source_depth_reg = c->nr_payload_regs;
2853       c->nr_payload_regs++;
2854       if (dispatch_width == 16) {
2855          /* R28: interpolated depth if not 8-wide. */
2856          c->nr_payload_regs++;
2857       }
2858    }
2859    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2860    if (uses_depth) {
2861       c->source_w_reg = c->nr_payload_regs;
2862       c->nr_payload_regs++;
2863       if (dispatch_width == 16) {
2864          /* R30: interpolated W if not 8-wide. */
2865          c->nr_payload_regs++;
2866       }
2867    }
2868    /* R31: MSAA position offsets. */
2869    /* R32-: bary for 32-pixel. */
2870    /* R58-59: interp W for 32-pixel. */
2871
2872    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2873       c->source_depth_to_render_target = true;
2874    }
2875 }
2876
2877 bool
2878 fs_visitor::run()
2879 {
2880    sanity_param_count = fp->Base.Parameters->NumParameters;
2881    uint32_t orig_nr_params = c->prog_data.nr_params;
2882
2883    if (intel->gen >= 6)
2884       setup_payload_gen6();
2885    else
2886       setup_payload_gen4();
2887
2888    if (0) {
2889       emit_dummy_fs();
2890    } else {
2891       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2892          emit_shader_time_begin();
2893
2894       calculate_urb_setup();
2895       if (intel->gen < 6)
2896          emit_interpolation_setup_gen4();
2897       else
2898          emit_interpolation_setup_gen6();
2899
2900       /* We handle discards by keeping track of the still-live pixels in f0.1.
2901        * Initialize it with the dispatched pixels.
2902        */
2903       if (fp->UsesKill) {
2904          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2905          discard_init->flag_subreg = 1;
2906       }
2907
2908       /* Generate FS IR for main().  (the visitor only descends into
2909        * functions called "main").
2910        */
2911       if (shader) {
2912          foreach_list(node, &*shader->ir) {
2913             ir_instruction *ir = (ir_instruction *)node;
2914             base_ir = ir;
2915             this->result = reg_undef;
2916             ir->accept(this);
2917          }
2918       } else {
2919          emit_fragment_program_code();
2920       }
2921       base_ir = NULL;
2922       if (failed)
2923          return false;
2924
2925       emit(FS_OPCODE_PLACEHOLDER_HALT);
2926
2927       emit_fb_writes();
2928
2929       split_virtual_grfs();
2930
2931       move_uniform_array_access_to_pull_constants();
2932       setup_pull_constants();
2933
2934       bool progress;
2935       do {
2936          progress = false;
2937
2938          compact_virtual_grfs();
2939
2940          progress = remove_duplicate_mrf_writes() || progress;
2941
2942          progress = opt_algebraic() || progress;
2943          progress = opt_cse() || progress;
2944          progress = opt_copy_propagate() || progress;
2945          progress = dead_code_eliminate() || progress;
2946          progress = dead_code_eliminate_local() || progress;
2947          progress = register_coalesce() || progress;
2948          progress = register_coalesce_2() || progress;
2949          progress = compute_to_mrf() || progress;
2950       } while (progress);
2951
2952       remove_dead_constants();
2953
2954       schedule_instructions(false);
2955
2956       lower_uniform_pull_constant_loads();
2957
2958       assign_curb_setup();
2959       assign_urb_setup();
2960
2961       if (0) {
2962          /* Debug of register spilling: Go spill everything. */
2963          for (int i = 0; i < virtual_grf_count; i++) {
2964             spill_reg(i);
2965          }
2966       }
2967
2968       if (0)
2969          assign_regs_trivial();
2970       else {
2971          while (!assign_regs()) {
2972             if (failed)
2973                break;
2974          }
2975       }
2976    }
2977    assert(force_uncompressed_stack == 0);
2978    assert(force_sechalf_stack == 0);
2979
2980    /* This must come after all optimization and register allocation, since
2981     * it inserts dead code that happens to have side effects, and it does
2982     * so based on the actual physical registers in use.
2983     */
2984    insert_gen4_send_dependency_workarounds();
2985
2986    if (failed)
2987       return false;
2988
2989    schedule_instructions(true);
2990
2991    if (dispatch_width == 8) {
2992       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2993    } else {
2994       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2995
2996       /* Make sure we didn't try to sneak in an extra uniform */
2997       assert(orig_nr_params == c->prog_data.nr_params);
2998       (void) orig_nr_params;
2999    }
3000
3001    /* If any state parameters were appended, then ParameterValues could have
3002     * been realloced, in which case the driver uniform storage set up by
3003     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3004     * sure that didn't happen.
3005     */
3006    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3007
3008    return !failed;
3009 }
3010
3011 const unsigned *
3012 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3013                struct gl_fragment_program *fp,
3014                struct gl_shader_program *prog,
3015                unsigned *final_assembly_size)
3016 {
3017    struct intel_context *intel = &brw->intel;
3018    bool start_busy = false;
3019    float start_time = 0;
3020
3021    if (unlikely(intel->perf_debug)) {
3022       start_busy = (intel->batch.last_bo &&
3023                     drm_intel_bo_busy(intel->batch.last_bo));
3024       start_time = get_time();
3025    }
3026
3027    struct brw_shader *shader = NULL;
3028    if (prog)
3029       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3030
3031    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3032       if (prog) {
3033          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3034          _mesa_print_ir(shader->ir, NULL);
3035          printf("\n\n");
3036       } else {
3037          printf("ARB_fragment_program %d ir for native fragment shader\n",
3038                 fp->Base.Id);
3039          _mesa_print_program(&fp->Base);
3040       }
3041    }
3042
3043    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3044     */
3045    fs_visitor v(brw, c, prog, fp, 8);
3046    if (!v.run()) {
3047       if (prog) {
3048          prog->LinkStatus = false;
3049          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3050       }
3051
3052       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3053                     v.fail_msg);
3054
3055       return NULL;
3056    }
3057
3058    exec_list *simd16_instructions = NULL;
3059    fs_visitor v2(brw, c, prog, fp, 16);
3060    bool no16 = INTEL_DEBUG & DEBUG_NO16;
3061    if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
3062       v2.import_uniforms(&v);
3063       if (!v2.run()) {
3064          perf_debug("16-wide shader failed to compile, falling back to "
3065                     "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3066       } else {
3067          simd16_instructions = &v2.instructions;
3068       }
3069    }
3070
3071    c->prog_data.dispatch_width = 8;
3072
3073    fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3074    const unsigned *generated = g.generate_assembly(&v.instructions,
3075                                                    simd16_instructions,
3076                                                    final_assembly_size);
3077
3078    if (unlikely(intel->perf_debug) && shader) {
3079       if (shader->compiled_once)
3080          brw_wm_debug_recompile(brw, prog, &c->key);
3081       shader->compiled_once = true;
3082
3083       if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
3084          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3085                     (get_time() - start_time) * 1000);
3086       }
3087    }
3088
3089    return generated;
3090 }
3091
3092 bool
3093 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3094 {
3095    struct brw_context *brw = brw_context(ctx);
3096    struct intel_context *intel = &brw->intel;
3097    struct brw_wm_prog_key key;
3098
3099    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3100       return true;
3101
3102    struct gl_fragment_program *fp = (struct gl_fragment_program *)
3103       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3104    struct brw_fragment_program *bfp = brw_fragment_program(fp);
3105    bool program_uses_dfdy = fp->UsesDFdy;
3106
3107    memset(&key, 0, sizeof(key));
3108
3109    if (intel->gen < 6) {
3110       if (fp->UsesKill)
3111          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3112
3113       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3114          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3115
3116       /* Just assume depth testing. */
3117       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3118       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3119    }
3120
3121    if (intel->gen < 6)
3122       key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3123
3124    for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3125       if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3126          continue;
3127
3128       if (intel->gen < 6) {
3129          if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3130             key.input_slots_valid |= BITFIELD64_BIT(i);
3131       }
3132    }
3133
3134    key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3135
3136    for (int i = 0; i < MAX_SAMPLERS; i++) {
3137       if (fp->Base.ShadowSamplers & (1 << i)) {
3138          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3139          key.tex.swizzles[i] =
3140             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3141       } else {
3142          /* Color sampler: assume no swizzling. */
3143          key.tex.swizzles[i] = SWIZZLE_XYZW;
3144       }
3145    }
3146
3147    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3148       key.drawable_height = ctx->DrawBuffer->Height;
3149    }
3150
3151    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3152       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3153    }
3154
3155    key.nr_color_regions = 1;
3156
3157    key.program_string_id = bfp->id;
3158
3159    uint32_t old_prog_offset = brw->wm.prog_offset;
3160    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3161
3162    bool success = do_wm_prog(brw, prog, bfp, &key);
3163
3164    brw->wm.prog_offset = old_prog_offset;
3165    brw->wm.prog_data = old_prog_data;
3166
3167    return success;
3168 }