/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;

   /* This will be the case for almost all instructions. */
   this->regs_written = 1;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2); \
   }

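/* Instantiate the emit helpers. Each ALUn(op) use below expands to a
 * one-line fs_visitor method that allocates an fs_inst out of mem_ctx; for
 * example, ALU2(ADD) produces:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */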
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (intel->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (intel->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   if (intel->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (intel->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

bool
fs_reg::is_valid_3src() const
{
   return file == GRF || file == UNIFORM;
}

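/**
 * Returns how many scalar slots a GLSL type occupies in our virtual GRF
 * allocation: e.g. a float is 1, a vec4 is 4, a mat3 is 9 (its
 * components()), arrays multiply by their length, and structs sum their
 * fields.
 */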
int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}

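/**
 * Reads the low dword of the TIMESTAMP architecture register into a fresh
 * GRF (gen7+ only, per the assert below).  The MOV is marked
 * force_writemask_all/force_uncompressed so the read happens regardless of
 * which channels are enabled in the dispatch.
 */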
fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                fs_reg(), payload, offset, value));
}

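/**
 * Marks the compile as failed and records a printf-formatted reason in
 * fail_msg.  Only the first failure is recorded; the message is also
 * printed to stderr when the WM debug flag is set.
 */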
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write()
{
   return (this->predicate ||
           this->force_uncompressed ||
           this->force_sechalf);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

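/**
 * Allocates a new virtual GRF of the given size (in full registers) and
 * returns its index, growing the virtual_grf_sizes array (doubling from an
 * initial 16 entries) as needed.
 */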
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
   (void)params_before;
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

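/**
 * Sets up the gl_FragCoord value: X/Y from the computed pixel coordinates
 * (flipping Y for FBO rendering and applying the half-pixel offset unless
 * pixel-center-integer is in effect), Z from the source depth payload on
 * gen6+ (interpolated on older gens), and W from the already-computed
 * wpos_w.
 */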
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (intel->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

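/**
 * Emits the interpolation setup (flat-shaded CINTERP or
 * smooth/noperspective LINTERP) for each element of a varying input,
 * walking array elements and matrix columns and skipping slots with no
 * incoming setup data.
 */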
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               struct brw_reg interp = interp_reg(location, k);
               emit_linterp(attr, fs_reg(interp), interpolation_mode,
                            ir->centroid);
               if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
               if (intel->gen < 6) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

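/**
 * Rewrites UNIFORM-file sources to the fixed payload GRFs where the push
 * constants (CURB) were loaded: constant N lives in component N % 8 of
 * payload register nr_payload_regs + N / 8.
 */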
void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

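/**
 * Decides which URB slot (if any) delivers each varying input, filling in
 * urb_setup[].  Each attribute occupies four setup channels at half a
 * register each, hence the resulting urb_read_length of urb_next * 2.
 */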
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
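/* For example, a vec4 temporary that was allocated as one size-4 virtual
 * GRF becomes four independent size-1 virtual GRFs here, so each component
 * gets its own live interval.
 */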
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         split_grf[inst->src[0].reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

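/**
 * Drops uniform parameters that no instruction still references and
 * renumbers the survivors, shrinking nr_params.  The remap table is built
 * during the 8-wide compile and reused (via import_uniforms) by the
 * 16-wide one.
 */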
bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index,
                                                     *inst->src[i].reladdr,
                                                     pull_constant_loc[uniform] +
                                                     inst->src[i].reg_offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

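/**
 * Local algebraic simplification: turns MUL/ADD with a trivial immediate
 * second operand into a MOV (a * 1.0 => a, a * 0.0 => 0.0, a + 0.0 => a).
 * Returns whether any instruction was rewritten.
 */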
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something defined but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

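/* The local dead code pass below tracks, per (virtual GRF, reg_offset)
 * channel, the most recent write that hasn't been read yet; these helpers
 * manage that hash table.
 */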
struct dead_code_hash_key
{
   int vgrf;
   int reg_offset;
};

static bool
dead_code_hash_compare(const void *a, const void *b)
{
   return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
}

static void
clear_dead_code_hash(struct hash_table *ht)
{
   struct hash_entry *entry;

   hash_table_foreach(ht, entry) {
      _mesa_hash_table_remove(ht, entry);
   }
}

static void
insert_dead_code_hash(struct hash_table *ht,
                      int vgrf, int reg_offset, fs_inst *inst)
{
   /* We don't bother freeing keys, because they'll be GCed with the ht. */
   struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);

   key->vgrf = vgrf;
   key->reg_offset = reg_offset;

   _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
}

static struct hash_entry *
get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
{
   struct dead_code_hash_key key;

   key.vgrf = vgrf;
   key.reg_offset = reg_offset;

   return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
}

static void
remove_dead_code_hash(struct hash_table *ht,
                      int vgrf, int reg_offset)
{
   struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
   if (!entry)
      return;

   _mesa_hash_table_remove(ht, entry);
}

/**
 * Walks basic blocks, removing any regs that are written but not read before
 * being redefined.
 *
 * The dead_code_eliminate() function implements a global dead code
 * elimination, but it only handles removing the last write to a register
 * if it's never read.  This one can handle intermediate writes, but only
 * within a basic block.
 */
bool
fs_visitor::dead_code_eliminate_local()
{
   struct hash_table *ht;
   bool progress = false;

   ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* At a basic block, empty the HT since we don't understand dataflow
       * here.
       */
      if (inst->is_control_flow()) {
         clear_dead_code_hash(ht);
         continue;
      }

      /* Clear the HT of any instructions that got read. */
      for (int i = 0; i < 3; i++) {
         fs_reg src = inst->src[i];
         if (src.file != GRF)
            continue;

         int read = 1;
         if (inst->is_send_from_grf())
            read = virtual_grf_sizes[src.reg] - src.reg_offset;

         for (int reg_offset = src.reg_offset;
              reg_offset < src.reg_offset + read;
              reg_offset++) {
            remove_dead_code_hash(ht, src.reg, reg_offset);
         }
      }

      /* Add any update of a GRF to the HT, removing a previous write if it
       * wasn't read.
       */
      if (inst->dst.file == GRF) {
         if (inst->regs_written > 1) {
            /* We don't know how to trim channels from an instruction's
             * writes, so we can't incrementally remove unread channels from
             * it.  Just remove whatever it overwrites from the table.
             */
            for (int i = 0; i < inst->regs_written; i++) {
               remove_dead_code_hash(ht,
                                     inst->dst.reg,
                                     inst->dst.reg_offset + i);
            }
         } else {
            struct hash_entry *entry =
               get_dead_code_hash_entry(ht, inst->dst.reg,
                                        inst->dst.reg_offset);

            if (inst->is_partial_write()) {
               /* For a partial write, we can't remove any previous dead code
                * candidate, since we're just modifying its result, but we can
                * be dead code eliminated ourselves.
                */
               if (entry) {
                  entry->data = inst;
               } else {
                  insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
                                        inst);
               }
            } else {
               if (entry) {
                  /* We're completely updating a channel, and there was a
                   * previous write to the channel that wasn't read.  Kill it!
                   */
                  fs_inst *inst = (fs_inst *)entry->data;
                  inst->remove();
                  progress = true;
                  _mesa_hash_table_remove(ht, entry);
               }

               insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
                                     inst);
            }
         }
      }
   }

   _mesa_hash_table_destroy(ht, NULL);

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1961
1962 /**
1963 * Implements a second type of register coalescing: This one checks if
1964 * the two regs involved in a raw move don't interfere, in which case
1965 * they can both be stored in the same place and the MOV removed.
1966 */
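/* A sketch (hypothetical IR): given "mov vgrf5, vgrf2" where vgrf2 is a
 * single register whose live range doesn't interfere with vgrf5's, the MOV
 * is removed and every def and use of vgrf2 is renamed to vgrf5.
 */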
1967 bool
1968 fs_visitor::register_coalesce_2()
1969 {
1970 bool progress = false;
1971
1972 calculate_live_intervals();
1973
1974 foreach_list_safe(node, &this->instructions) {
1975 fs_inst *inst = (fs_inst *)node;
1976
1977 if (inst->opcode != BRW_OPCODE_MOV ||
1978 inst->is_partial_write() ||
1979 inst->saturate ||
1980 inst->src[0].file != GRF ||
1981 inst->src[0].negate ||
1982 inst->src[0].abs ||
1983 inst->src[0].smear != -1 ||
1984 inst->dst.file != GRF ||
1985 inst->dst.type != inst->src[0].type ||
1986 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1987 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1988 continue;
1989 }
1990
1991 int reg_from = inst->src[0].reg;
1992 assert(inst->src[0].reg_offset == 0);
1993 int reg_to = inst->dst.reg;
1994 int reg_to_offset = inst->dst.reg_offset;
1995
1996 foreach_list(node, &this->instructions) {
1997 fs_inst *scan_inst = (fs_inst *)node;
1998
1999 if (scan_inst->dst.file == GRF &&
2000 scan_inst->dst.reg == reg_from) {
2001 scan_inst->dst.reg = reg_to;
2002 scan_inst->dst.reg_offset = reg_to_offset;
2003 }
2004 for (int i = 0; i < 3; i++) {
2005 if (scan_inst->src[i].file == GRF &&
2006 scan_inst->src[i].reg == reg_from) {
2007 scan_inst->src[i].reg = reg_to;
2008 scan_inst->src[i].reg_offset = reg_to_offset;
2009 }
2010 }
2011 }
2012
2013 inst->remove();
2014
2015 /* We don't need to recalculate live intervals inside the loop despite
2016 * clearing live_intervals_valid, because we only use live intervals for
2017 * the interferes test, and we must have had a situation where the
2018 * intervals were:
2019 *
2020 *    from  to
2021 *     ^
2022 *     |
2023 *     v
2024 *           ^
2025 *           |
2026 *           v
2027 *
2028 * Some register R that might get coalesced with one of these two could
2029 * only be referencing "to", otherwise "from"'s range would have been
2030 * longer. R's range could also only start at the end of "to" or later,
2031 * otherwise it will conflict with "to" when we try to coalesce "to"
2032 * into R anyway.
2033 */
2034 live_intervals_valid = false;
2035
2036 progress = true;
2037 continue;
2038 }
2039
2040 return progress;
2041 }
2042
2043 bool
2044 fs_visitor::register_coalesce()
2045 {
2046 bool progress = false;
2047 int if_depth = 0;
2048 int loop_depth = 0;
2049
2050 foreach_list_safe(node, &this->instructions) {
2051 fs_inst *inst = (fs_inst *)node;
2052
2053 /* Make sure that we dominate the instructions we're going to
2054 * scan for interfering with our coalescing, or we won't have
2055 * scanned enough to see if anything interferes with our
2056 * coalescing. We don't dominate the following instructions if
2057 * we're in a loop or an if block.
2058 */
2059 switch (inst->opcode) {
2060 case BRW_OPCODE_DO:
2061 loop_depth++;
2062 break;
2063 case BRW_OPCODE_WHILE:
2064 loop_depth--;
2065 break;
2066 case BRW_OPCODE_IF:
2067 if_depth++;
2068 break;
2069 case BRW_OPCODE_ENDIF:
2070 if_depth--;
2071 break;
2072 default:
2073 break;
2074 }
2075 if (loop_depth || if_depth)
2076 continue;
2077
2078 if (inst->opcode != BRW_OPCODE_MOV ||
2079 inst->is_partial_write() ||
2080 inst->saturate ||
2081 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2082 inst->src[0].file != UNIFORM) ||
2083 inst->dst.type != inst->src[0].type)
2084 continue;
2085
2086 bool has_source_modifiers = (inst->src[0].abs ||
2087 inst->src[0].negate ||
2088 inst->src[0].smear != -1 ||
2089 inst->src[0].file == UNIFORM);
2090
2091 /* Found a move of a GRF or uniform to a GRF. Let's see if we can coalesce
2092 * them: check for no writes to either one until the exit of the
2093 * program.
2094 */
2095 bool interfered = false;
2096
2097 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2098 !scan_inst->is_tail_sentinel();
2099 scan_inst = (fs_inst *)scan_inst->next) {
2100 if (scan_inst->dst.file == GRF) {
2101 if (scan_inst->overwrites_reg(inst->dst) ||
2102 scan_inst->overwrites_reg(inst->src[0])) {
2103 interfered = true;
2104 break;
2105 }
2106 }
2107
2108 /* The gen6 MATH instruction can't handle source modifiers or
2109 * unusual register regions, so avoid coalescing those for
2110 * now. We should do something more specific.
2111 */
2112 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2113 interfered = true;
2114 break;
2115 }
2116
2117 /* The accumulator result appears to get used for the
2118 * conditional modifier generation. When negating a UD
2119 * value, there is a 33rd bit generated for the sign in the
2120 * accumulator value, so now you can't check, for example,
2121 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2122 */
2123 if (scan_inst->conditional_mod &&
2124 inst->src[0].negate &&
2125 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2126 interfered = true;
2127 break;
2128 }
2129 }
2130 if (interfered) {
2131 continue;
2132 }
2133
2134 /* Rewrite the later usage to point at the source of the move to
2135 * be removed.
2136 */
2137 for (fs_inst *scan_inst = inst;
2138 !scan_inst->is_tail_sentinel();
2139 scan_inst = (fs_inst *)scan_inst->next) {
2140 for (int i = 0; i < 3; i++) {
2141 if (scan_inst->src[i].file == GRF &&
2142 scan_inst->src[i].reg == inst->dst.reg &&
2143 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2144 fs_reg new_src = inst->src[0];
2145 if (scan_inst->src[i].abs) {
2146 new_src.negate = 0;
2147 new_src.abs = 1;
2148 }
2149 new_src.negate ^= scan_inst->src[i].negate;
2150 scan_inst->src[i] = new_src;
2151 }
2152 }
2153 }
2154
2155 inst->remove();
2156 progress = true;
2157 }
2158
2159 if (progress)
2160 live_intervals_valid = false;
2161
2162 return progress;
2163 }
2164
2165
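/* Tries to rewrite the instruction that computed a GRF so it writes the
 * destination MRF directly, removing the copy. A sketch (hypothetical IR):
 *
 *    add vgrf8, vgrf6, vgrf7
 *    mov m2, vgrf8              ->    add m2, vgrf6, vgrf7
 */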
2166 bool
2167 fs_visitor::compute_to_mrf()
2168 {
2169 bool progress = false;
2170 int next_ip = 0;
2171
2172 calculate_live_intervals();
2173
2174 foreach_list_safe(node, &this->instructions) {
2175 fs_inst *inst = (fs_inst *)node;
2176
2177 int ip = next_ip;
2178 next_ip++;
2179
2180 if (inst->opcode != BRW_OPCODE_MOV ||
2181 inst->is_partial_write() ||
2182 inst->dst.file != MRF || inst->src[0].file != GRF ||
2183 inst->dst.type != inst->src[0].type ||
2184 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2185 continue;
2186
2187 /* Work out which hardware MRF registers are written by this
2188 * instruction.
2189 */
2190 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2191 int mrf_high;
2192 if (inst->dst.reg & BRW_MRF_COMPR4) {
2193 mrf_high = mrf_low + 4;
2194 } else if (dispatch_width == 16 &&
2195 (!inst->force_uncompressed && !inst->force_sechalf)) {
2196 mrf_high = mrf_low + 1;
2197 } else {
2198 mrf_high = mrf_low;
2199 }
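/* For instance (illustrative): a COMPR4 write to m3 lands in m3 and m7,
 * an ordinary 16-wide write to m3 lands in m3 and m4, and an 8-wide (or
 * half-forced) write touches only m3.
 */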
2200
2201 /* Can't compute-to-MRF this GRF if someone else was going to
2202 * read it later.
2203 */
2204 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2205 continue;
2206
2207 /* Found a move of a GRF to a MRF. Let's see if we can go
2208 * rewrite the thing that made this GRF to write into the MRF.
2209 */
2210 fs_inst *scan_inst;
2211 for (scan_inst = (fs_inst *)inst->prev;
2212 scan_inst->prev != NULL;
2213 scan_inst = (fs_inst *)scan_inst->prev) {
2214 if (scan_inst->dst.file == GRF &&
2215 scan_inst->dst.reg == inst->src[0].reg) {
2216 /* Found the last thing to write our reg we want to turn
2217 * into a compute-to-MRF.
2218 */
2219
2220 /* If this one instruction didn't populate all the
2221 * channels, bail. We might be able to rewrite everything
2222 * that writes that reg, but it would require smarter
2223 * tracking to delay the rewriting until complete success.
2224 */
2225 if (scan_inst->is_partial_write())
2226 break;
2227
2228 /* Things returning more than one register would need us to
2229 * understand coalescing out more than one MOV at a time.
2230 */
2231 if (scan_inst->regs_written > 1)
2232 break;
2233
2234 /* SEND instructions can't have MRF as a destination. */
2235 if (scan_inst->mlen)
2236 break;
2237
2238 if (intel->gen == 6) {
2239 /* gen6 math instructions must have the destination be
2240 * GRF, so no compute-to-MRF for them.
2241 */
2242 if (scan_inst->is_math()) {
2243 break;
2244 }
2245 }
2246
2247 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2248 /* Found the creator of our MRF's source value. */
2249 scan_inst->dst.file = MRF;
2250 scan_inst->dst.reg = inst->dst.reg;
2251 scan_inst->saturate |= inst->saturate;
2252 inst->remove();
2253 progress = true;
2254 }
2255 break;
2256 }
2257
2258 /* We don't handle control flow here. Most computation of
2259 * values that end up in MRFs happens shortly before the MRF
2260 * write anyway.
2261 */
2262 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2263 break;
2264
2265 /* You can't read from an MRF, so if someone else reads our
2266 * MRF's source GRF that we wanted to rewrite, that stops us.
2267 */
2268 bool interfered = false;
2269 for (int i = 0; i < 3; i++) {
2270 if (scan_inst->src[i].file == GRF &&
2271 scan_inst->src[i].reg == inst->src[0].reg &&
2272 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2273 interfered = true;
2274 }
2275 }
2276 if (interfered)
2277 break;
2278
2279 if (scan_inst->dst.file == MRF) {
2280 /* If somebody else writes our MRF here, we can't
2281 * compute-to-MRF before that.
2282 */
2283 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2284 int scan_mrf_high;
2285
2286 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2287 scan_mrf_high = scan_mrf_low + 4;
2288 } else if (dispatch_width == 16 &&
2289 (!scan_inst->force_uncompressed &&
2290 !scan_inst->force_sechalf)) {
2291 scan_mrf_high = scan_mrf_low + 1;
2292 } else {
2293 scan_mrf_high = scan_mrf_low;
2294 }
2295
2296 if (mrf_low == scan_mrf_low ||
2297 mrf_low == scan_mrf_high ||
2298 mrf_high == scan_mrf_low ||
2299 mrf_high == scan_mrf_high) {
2300 break;
2301 }
2302 }
2303
2304 if (scan_inst->mlen > 0) {
2305 /* Found a SEND instruction, which means that there are
2306 * live values in MRFs from base_mrf to base_mrf +
2307 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2308 * above it.
2309 */
2310 if (mrf_low >= scan_inst->base_mrf &&
2311 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2312 break;
2313 }
2314 if (mrf_high >= scan_inst->base_mrf &&
2315 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2316 break;
2317 }
2318 }
2319 }
2320 }
2321
2322 if (progress)
2323 live_intervals_valid = false;
2324
2325 return progress;
2326 }
2327
2328 /**
2329 * Walks through basic blocks, looking for repeated MRF writes and
2330 * removing the later ones.
2331 */
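/* A sketch of the redundancy this catches (hypothetical IR):
 *
 *    mov m4, vgrf2
 *    mul vgrf5, vgrf3, vgrf4
 *    mov m4, vgrf2    <- equal to the still-valid earlier write: removed
 */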
2332 bool
2333 fs_visitor::remove_duplicate_mrf_writes()
2334 {
2335 fs_inst *last_mrf_move[16];
2336 bool progress = false;
2337
2338 /* The MRF tracking below doesn't yet handle compressed (16-wide) instructions. */
2339 if (dispatch_width == 16)
2340 return false;
2341
2342 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2343
2344 foreach_list_safe(node, &this->instructions) {
2345 fs_inst *inst = (fs_inst *)node;
2346
2347 if (inst->is_control_flow()) {
2348 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2349 }
2350
2351 if (inst->opcode == BRW_OPCODE_MOV &&
2352 inst->dst.file == MRF) {
2353 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2354 if (prev_inst && inst->equals(prev_inst)) {
2355 inst->remove();
2356 progress = true;
2357 continue;
2358 }
2359 }
2360
2361 /* Clear out the last-write records for MRFs that were overwritten. */
2362 if (inst->dst.file == MRF) {
2363 last_mrf_move[inst->dst.reg] = NULL;
2364 }
2365
2366 if (inst->mlen > 0) {
2367 /* Found a SEND instruction, which will include two or fewer
2368 * implied MRF writes. We could do better here.
2369 */
2370 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2371 last_mrf_move[inst->base_mrf + i] = NULL;
2372 }
2373 }
2374
2375 /* Clear out any MRF move records whose sources got overwritten. */
2376 if (inst->dst.file == GRF) {
2377 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2378 if (last_mrf_move[i] &&
2379 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2380 last_mrf_move[i] = NULL;
2381 }
2382 }
2383 }
2384
2385 if (inst->opcode == BRW_OPCODE_MOV &&
2386 inst->dst.file == MRF &&
2387 inst->src[0].file == GRF &&
2388 !inst->is_partial_write()) {
2389 last_mrf_move[inst->dst.reg] = inst;
2390 }
2391 }
2392
2393 if (progress)
2394 live_intervals_valid = false;
2395
2396 return progress;
2397 }
2398
2399 static void
2400 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2401 int first_grf, int grf_len)
2402 {
2403 bool inst_16wide = (dispatch_width > 8 &&
2404 !inst->force_uncompressed &&
2405 !inst->force_sechalf);
2406
2407 /* Clear the flag for registers that actually got read (as expected). */
2408 for (int i = 0; i < 3; i++) {
2409 int grf;
2410 if (inst->src[i].file == GRF) {
2411 grf = inst->src[i].reg;
2412 } else if (inst->src[i].file == HW_REG &&
2413 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2414 grf = inst->src[i].fixed_hw_reg.nr;
2415 } else {
2416 continue;
2417 }
2418
2419 if (grf >= first_grf &&
2420 grf < first_grf + grf_len) {
2421 deps[grf - first_grf] = false;
2422 if (inst_16wide)
2423 deps[grf - first_grf + 1] = false;
2424 }
2425 }
2426 }
2427
2428 /**
2429 * Implements this workaround for the original 965:
2430 *
2431 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2432 * check for post destination dependencies on this instruction, software
2433 * must ensure that there is no destination hazard for the case of ‘write
2434 * followed by a posted write’ shown in the following example.
2435 *
2436 * 1. mov r3 0
2437 * 2. send r3.xy <rest of send instruction>
2438 * 3. mov r2 r3
2439 *
2440 * Due to no post-destination dependency check on the ‘send’, the above
2441 * code sequence could have two instructions (1 and 2) in flight at the
2442 * same time that both consider ‘r3’ as the target of their final writes."
2443 */
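/* On the example above, the pass restores correctness by sourcing r3 before
 * the send issues its posted write (illustrative; the resolve is a self-MOV):
 *
 *    1. mov r3 0
 *       mov r3 r3     <- dependency-resolve MOV inserted by this pass
 *    2. send r3.xy <rest of send instruction>
 *    3. mov r2 r3
 */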
2444 void
2445 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2446 {
2447 int reg_size = dispatch_width / 8;
2448 int write_len = inst->regs_written * reg_size;
2449 int first_write_grf = inst->dst.reg;
2450 bool needs_dep[BRW_MAX_MRF];
2451 assert(write_len < (int)sizeof(needs_dep) - 1);
2452
2453 memset(needs_dep, false, sizeof(needs_dep));
2454 memset(needs_dep, true, write_len);
2455
2456 clear_deps_for_inst_src(inst, dispatch_width,
2457 needs_dep, first_write_grf, write_len);
2458
2459 /* Walk backwards looking for writes to registers we're writing which
2460 * aren't read since being written. If we hit the start of the program,
2461 * we assume that there are no outstanding dependencies on entry to the
2462 * program.
2463 */
2464 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2465 scan_inst != NULL;
2466 scan_inst = (fs_inst *)scan_inst->prev) {
2467
2468 /* If we hit control flow, assume that there *are* outstanding
2469 * dependencies, and force their cleanup before our instruction.
2470 */
2471 if (scan_inst->is_control_flow()) {
2472 for (int i = 0; i < write_len; i++) {
2473 if (needs_dep[i]) {
2474 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2475 }
2476 }
2477 return;
2478 }
2479
2480 bool scan_inst_16wide = (dispatch_width > 8 &&
2481 !scan_inst->force_uncompressed &&
2482 !scan_inst->force_sechalf);
2483
2484 /* We insert our reads as late as possible on the assumption that any
2485 * instruction but a MOV that might have left us an outstanding
2486 * dependency has more latency than a MOV.
2487 */
2488 if (scan_inst->dst.file == GRF) {
2489 for (int i = 0; i < scan_inst->regs_written; i++) {
2490 int reg = scan_inst->dst.reg + i * reg_size;
2491
2492 if (reg >= first_write_grf &&
2493 reg < first_write_grf + write_len &&
2494 needs_dep[reg - first_write_grf]) {
2495 inst->insert_before(DEP_RESOLVE_MOV(reg));
2496 needs_dep[reg - first_write_grf] = false;
2497 if (scan_inst_16wide)
2498 needs_dep[reg - first_write_grf + 1] = false;
2499 }
2500 }
2501 }
2502
2503 /* Clear the flag for registers that actually got read (as expected). */
2504 clear_deps_for_inst_src(scan_inst, dispatch_width,
2505 needs_dep, first_write_grf, write_len);
2506
2507 /* Continue the loop only if we haven't resolved all the dependencies. */
2508 int i;
2509 for (i = 0; i < write_len; i++) {
2510 if (needs_dep[i])
2511 break;
2512 }
2513 if (i == write_len)
2514 return;
2515 }
2516 }
2517
2518 /**
2519 * Implements this workaround for the original 965:
2520 *
2521 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2522 * used as a destination register until after it has been sourced by an
2523 * instruction with a different destination register."
2524 */
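/* Illustrative case: after "send" writes r3, a following "mov r3 r5" may not
 * target r3 until r3 has been sourced; when no intervening read exists, this
 * pass inserts a resolve MOV that reads r3 just before the offending write.
 */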
2525 void
2526 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2527 {
2528 int write_len = inst->regs_written * dispatch_width / 8;
2529 int first_write_grf = inst->dst.reg;
2530 bool needs_dep[BRW_MAX_MRF];
2531 assert(write_len < (int)sizeof(needs_dep) - 1);
2532
2533 memset(needs_dep, false, sizeof(needs_dep));
2534 memset(needs_dep, true, write_len);
2535 /* Walk forwards looking for writes to registers we're writing which aren't
2536 * read before being written.
2537 */
2538 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2539 !scan_inst->is_tail_sentinel();
2540 scan_inst = (fs_inst *)scan_inst->next) {
2541 /* If we hit control flow, force resolve all remaining dependencies. */
2542 if (scan_inst->is_control_flow()) {
2543 for (int i = 0; i < write_len; i++) {
2544 if (needs_dep[i])
2545 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2546 }
2547 return;
2548 }
2549
2550 /* Clear the flag for registers that actually got read (as expected). */
2551 clear_deps_for_inst_src(scan_inst, dispatch_width,
2552 needs_dep, first_write_grf, write_len);
2553
2554 /* We insert our reads as late as possible since they're reading the
2555 * result of a SEND, which has massive latency.
2556 */
2557 if (scan_inst->dst.file == GRF &&
2558 scan_inst->dst.reg >= first_write_grf &&
2559 scan_inst->dst.reg < first_write_grf + write_len &&
2560 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2561 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2562 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2563 }
2564
2565 /* Continue the loop only if we haven't resolved all the dependencies. */
2566 int i;
2567 for (i = 0; i < write_len; i++) {
2568 if (needs_dep[i])
2569 break;
2570 }
2571 if (i == write_len)
2572 return;
2573 }
2574
2575 /* If we hit the end of the program, resolve all remaining dependencies out
2576 * of paranoia.
2577 */
2578 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2579 assert(last_inst->eot);
2580 for (int i = 0; i < write_len; i++) {
2581 if (needs_dep[i])
2582 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2583 }
2584 }
2585
2586 void
2587 fs_visitor::insert_gen4_send_dependency_workarounds()
2588 {
2589 if (intel->gen != 4 || intel->is_g4x)
2590 return;
2591
2592 /* Note that we're done with register allocation, so GRF fs_regs always
2593 * have a .reg_offset of 0.
2594 */
2595
2596 foreach_list_safe(node, &this->instructions) {
2597 fs_inst *inst = (fs_inst *)node;
2598
2599 if (inst->mlen != 0 && inst->dst.file == GRF) {
2600 insert_gen4_pre_send_dependency_workarounds(inst);
2601 insert_gen4_post_send_dependency_workarounds(inst);
2602 }
2603 }
2604 }
2605
2606 /**
2607 * Turns the generic expression-style uniform pull constant load instruction
2608 * into a hardware-specific series of instructions for loading a pull
2609 * constant.
2610 *
2611 * The expression style allows the CSE pass before this to optimize out
2612 * repeated loads from the same offset, and gives the pre-register-allocation
2613 * scheduling full flexibility, while the conversion to native instructions
2614 * allows the post-register-allocation scheduler the best information
2615 * possible.
2616 *
2617 * Note that execution masking for setting up pull constant loads is special:
2618 * the channels that need to be written are unrelated to the current execution
2619 * mask, since a later instruction will use one of the result channels as a
2620 * source operand for all 8 or 16 of its channels.
2621 */
2622 void
2623 fs_visitor::lower_uniform_pull_constant_loads()
2624 {
2625 foreach_list(node, &this->instructions) {
2626 fs_inst *inst = (fs_inst *)node;
2627
2628 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2629 continue;
2630
2631 if (intel->gen >= 7) {
2632 /* The offset arg before was a vec4-aligned byte offset. We need to
2633 * turn it into a dword offset.
2634 */
2635 fs_reg const_offset_reg = inst->src[1];
2636 assert(const_offset_reg.file == IMM &&
2637 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2638 const_offset_reg.imm.u /= 4;
2639 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2640
2641 /* This is actually going to be a MOV, but since only the first dword
2642 * is accessed, we have a special opcode to do just that one. Note
2643 * that this needs to be an operation that will be considered a def
2644 * by live variable analysis, or register allocation will explode.
2645 */
2646 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2647 payload, const_offset_reg);
2648 setup->force_writemask_all = true;
2649
2650 setup->ir = inst->ir;
2651 setup->annotation = inst->annotation;
2652 inst->insert_before(setup);
2653
2654 /* Similarly, this will only populate the first 4 channels of the
2655 * result register (since we only use smear values from 0-3), but we
2656 * don't tell the optimizer.
2657 */
2658 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2659 inst->src[1] = payload;
2660
2661 this->live_intervals_valid = false;
2662 } else {
2663 /* Before register allocation, we didn't tell the scheduler about the
2664 * MRF we use. We know it's safe to use this MRF because nothing
2665 * else does except for register spill/unspill, which generates and
2666 * uses its MRF within a single IR instruction.
2667 */
2668 inst->base_mrf = 14;
2669 inst->mlen = 1;
2670 }
2671 }
2672 }
2673
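/* Prints one IR instruction for debugging. Sample output (made up):
 *
 *    (+f0.1) mov.sat vgrf7+1, u2.1, (null), (null) 2ndhalf
 */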
2674 void
2675 fs_visitor::dump_instruction(backend_instruction *be_inst)
2676 {
2677 fs_inst *inst = (fs_inst *)be_inst;
2678
2679 if (inst->predicate) {
2680 printf("(%cf0.%d) ",
2681 inst->predicate_inverse ? '-' : '+',
2682 inst->flag_subreg);
2683 }
2684
2685 printf("%s", brw_instruction_name(inst->opcode));
2686 if (inst->saturate)
2687 printf(".sat");
2688 if (inst->conditional_mod) {
2689 printf(".cmod");
2690 if (!inst->predicate &&
2691 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2692 inst->opcode != BRW_OPCODE_IF &&
2693 inst->opcode != BRW_OPCODE_WHILE))) {
2694 printf(".f0.%d\n", inst->flag_subreg);
2695 }
2696 }
2697 printf(" ");
2698
2699
2700 switch (inst->dst.file) {
2701 case GRF:
2702 printf("vgrf%d", inst->dst.reg);
2703 if (inst->dst.reg_offset)
2704 printf("+%d", inst->dst.reg_offset);
2705 break;
2706 case MRF:
2707 printf("m%d", inst->dst.reg);
2708 break;
2709 case BAD_FILE:
2710 printf("(null)");
2711 break;
2712 case UNIFORM:
2713 printf("***u%d***", inst->dst.reg);
2714 break;
2715 default:
2716 printf("???");
2717 break;
2718 }
2719 printf(", ");
2720
2721 for (int i = 0; i < 3; i++) {
2722 if (inst->src[i].negate)
2723 printf("-");
2724 if (inst->src[i].abs)
2725 printf("|");
2726 switch (inst->src[i].file) {
2727 case GRF:
2728 printf("vgrf%d", inst->src[i].reg);
2729 if (inst->src[i].reg_offset)
2730 printf("+%d", inst->src[i].reg_offset);
2731 break;
2732 case MRF:
2733 printf("***m%d***", inst->src[i].reg);
2734 break;
2735 case UNIFORM:
2736 printf("u%d", inst->src[i].reg);
2737 if (inst->src[i].reg_offset)
2738 printf(".%d", inst->src[i].reg_offset);
2739 break;
2740 case BAD_FILE:
2741 printf("(null)");
2742 break;
2743 case IMM:
2744 switch (inst->src[i].type) {
2745 case BRW_REGISTER_TYPE_F:
2746 printf("%ff", inst->src[i].imm.f);
2747 break;
2748 case BRW_REGISTER_TYPE_D:
2749 printf("%dd", inst->src[i].imm.i);
2750 break;
2751 case BRW_REGISTER_TYPE_UD:
2752 printf("%uu", inst->src[i].imm.u);
2753 break;
2754 default:
2755 printf("???");
2756 break;
2757 }
2758 break;
2759 default:
2760 printf("???");
2761 break;
2762 }
2763 if (inst->src[i].abs)
2764 printf("|");
2765
2766 if (i < 2)
2767 printf(", ");
2768 }
2769
2770 printf(" ");
2771
2772 if (inst->force_uncompressed)
2773 printf("1sthalf ");
2774
2775 if (inst->force_sechalf)
2776 printf("2ndhalf ");
2777
2778 printf("\n");
2779 }
2780
2781 /**
2782 * Possibly returns an instruction that set up @param reg.
2783 *
2784 * Sometimes we want to take the result of some expression/variable
2785 * dereference tree and rewrite the instruction generating the result
2786 * of the tree. When processing the tree, we know that the
2787 * instructions generated are all writing temporaries that are dead
2788 * outside of this tree. So, if we have some instructions that write
2789 * a temporary, we're free to point that temp write somewhere else.
2790 *
2791 * Note that this doesn't guarantee that the returned instruction wrote
2792 * only reg -- it might be the size=4 destination of a texture instruction.
2793 */
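/* For instance (hypothetical): with end = "add vgrf3, vgrf1, vgrf2" and
 * reg = vgrf3, the add is returned so the caller can retarget its
 * destination; a partial, reladdr, or mismatched write yields NULL.
 */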
2794 fs_inst *
2795 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2796 fs_inst *end,
2797 fs_reg reg)
2798 {
2799 if (end == start ||
2800 end->is_partial_write() ||
2801 reg.reladdr ||
2802 !reg.equals(end->dst)) {
2803 return NULL;
2804 } else {
2805 return end;
2806 }
2807 }
2808
2809 void
2810 fs_visitor::setup_payload_gen6()
2811 {
2812 bool uses_depth =
2813 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2814 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2815
2816 assert(intel->gen >= 6);
2817
2818 /* R0-1: masks, pixel X/Y coordinates. */
2819 c->nr_payload_regs = 2;
2820 /* R2: only for 32-pixel dispatch. */
2821
2822 /* R3-26: barycentric interpolation coordinates. These appear in the
2823 * same order that they appear in the brw_wm_barycentric_interp_mode
2824 * enum. Each set of coordinates occupies 2 registers if dispatch width
2825 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2826 * appear if they were enabled using the "Barycentric Interpolation
2827 * Mode" bits in WM_STATE.
2828 */
2829 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2830 if (barycentric_interp_modes & (1 << i)) {
2831 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2832 c->nr_payload_regs += 2;
2833 if (dispatch_width == 16) {
2834 c->nr_payload_regs += 2;
2835 }
2836 }
2837 }
2838
2839 /* R27: interpolated depth if uses source depth */
2840 if (uses_depth) {
2841 c->source_depth_reg = c->nr_payload_regs;
2842 c->nr_payload_regs++;
2843 if (dispatch_width == 16) {
2844 /* R28: interpolated depth if not 8-wide. */
2845 c->nr_payload_regs++;
2846 }
2847 }
2848 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2849 if (uses_depth) {
2850 c->source_w_reg = c->nr_payload_regs;
2851 c->nr_payload_regs++;
2852 if (dispatch_width == 16) {
2853 /* R30: interpolated W if not 8-wide. */
2854 c->nr_payload_regs++;
2855 }
2856 }
2857 /* R31: MSAA position offsets. */
2858 /* R32-: bary for 32-pixel. */
2859 /* R58-59: interp W for 32-pixel. */
2860
2861 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2862 c->source_depth_to_render_target = true;
2863 }
2864 }
2865
2866 bool
2867 fs_visitor::run()
2868 {
2869 sanity_param_count = fp->Base.Parameters->NumParameters;
2870 uint32_t orig_nr_params = c->prog_data.nr_params;
2871
2872 if (intel->gen >= 6)
2873 setup_payload_gen6();
2874 else
2875 setup_payload_gen4();
2876
2877 if (0) {
2878 emit_dummy_fs();
2879 } else {
2880 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2881 emit_shader_time_begin();
2882
2883 calculate_urb_setup();
2884 if (intel->gen < 6)
2885 emit_interpolation_setup_gen4();
2886 else
2887 emit_interpolation_setup_gen6();
2888
2889 /* We handle discards by keeping track of the still-live pixels in f0.1.
2890 * Initialize it with the dispatched pixels.
2891 */
2892 if (fp->UsesKill) {
2893 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2894 discard_init->flag_subreg = 1;
2895 }
2896
2897 /* Generate FS IR for main(). (The visitor only descends into
2898 * functions called "main".)
2899 */
2900 if (shader) {
2901 foreach_list(node, &*shader->ir) {
2902 ir_instruction *ir = (ir_instruction *)node;
2903 base_ir = ir;
2904 this->result = reg_undef;
2905 ir->accept(this);
2906 }
2907 } else {
2908 emit_fragment_program_code();
2909 }
2910 base_ir = NULL;
2911 if (failed)
2912 return false;
2913
2914 emit(FS_OPCODE_PLACEHOLDER_HALT);
2915
2916 emit_fb_writes();
2917
2918 split_virtual_grfs();
2919
2920 move_uniform_array_access_to_pull_constants();
2921 setup_pull_constants();
2922
2923 bool progress;
2924 do {
2925 progress = false;
2926
2927 compact_virtual_grfs();
2928
2929 progress = remove_duplicate_mrf_writes() || progress;
2930
2931 progress = opt_algebraic() || progress;
2932 progress = opt_cse() || progress;
2933 progress = opt_copy_propagate() || progress;
2934 progress = dead_code_eliminate() || progress;
2935 progress = dead_code_eliminate_local() || progress;
2936 progress = register_coalesce() || progress;
2937 progress = register_coalesce_2() || progress;
2938 progress = compute_to_mrf() || progress;
2939 } while (progress);
2940
2941 remove_dead_constants();
2942
2943 schedule_instructions(false);
2944
2945 lower_uniform_pull_constant_loads();
2946
2947 assign_curb_setup();
2948 assign_urb_setup();
2949
2950 if (0) {
2951 /* Debug of register spilling: Go spill everything. */
2952 for (int i = 0; i < virtual_grf_count; i++) {
2953 spill_reg(i);
2954 }
2955 }
2956
2957 if (0)
2958 assign_regs_trivial();
2959 else {
2960 while (!assign_regs()) {
2961 if (failed)
2962 break;
2963 }
2964 }
2965 }
2966 assert(force_uncompressed_stack == 0);
2967 assert(force_sechalf_stack == 0);
2968
2969 /* This must come after all optimization and register allocation, since
2970 * it inserts dead code that happens to have side effects, and it does
2971 * so based on the actual physical registers in use.
2972 */
2973 insert_gen4_send_dependency_workarounds();
2974
2975 if (failed)
2976 return false;
2977
2978 schedule_instructions(true);
2979
2980 if (dispatch_width == 8) {
2981 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2982 } else {
2983 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2984
2985 /* Make sure we didn't try to sneak in an extra uniform */
2986 assert(orig_nr_params == c->prog_data.nr_params);
2987 (void) orig_nr_params;
2988 }
2989
2990 /* If any state parameters were appended, then ParameterValues could have
2991 * been realloced, in which case the driver uniform storage set up by
2992 * _mesa_associate_uniform_storage() would point to freed memory. Make
2993 * sure that didn't happen.
2994 */
2995 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2996
2997 return !failed;
2998 }
2999
3000 const unsigned *
3001 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3002 struct gl_fragment_program *fp,
3003 struct gl_shader_program *prog,
3004 unsigned *final_assembly_size)
3005 {
3006 struct intel_context *intel = &brw->intel;
3007 bool start_busy = false;
3008 float start_time = 0;
3009
3010 if (unlikely(intel->perf_debug)) {
3011 start_busy = (intel->batch.last_bo &&
3012 drm_intel_bo_busy(intel->batch.last_bo));
3013 start_time = get_time();
3014 }
3015
3016 struct brw_shader *shader = NULL;
3017 if (prog)
3018 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3019
3020 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3021 if (prog) {
3022 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3023 _mesa_print_ir(shader->ir, NULL);
3024 printf("\n\n");
3025 } else {
3026 printf("ARB_fragment_program %d ir for native fragment shader\n",
3027 fp->Base.Id);
3028 _mesa_print_program(&fp->Base);
3029 }
3030 }
3031
3032 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3033 */
3034 fs_visitor v(brw, c, prog, fp, 8);
3035 if (!v.run()) {
3036 if (prog) {
3037 prog->LinkStatus = false;
3038 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3039 }
3040
3041 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3042 v.fail_msg);
3043
3044 return NULL;
3045 }
3046
3047 exec_list *simd16_instructions = NULL;
3048 fs_visitor v2(brw, c, prog, fp, 16);
3049 bool no16 = INTEL_DEBUG & DEBUG_NO16;
3050 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
3051 v2.import_uniforms(&v);
3052 if (!v2.run()) {
3053 perf_debug("16-wide shader failed to compile, falling back to "
3054 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3055 } else {
3056 simd16_instructions = &v2.instructions;
3057 }
3058 }
3059
3060 c->prog_data.dispatch_width = 8;
3061
3062 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3063 const unsigned *generated = g.generate_assembly(&v.instructions,
3064 simd16_instructions,
3065 final_assembly_size);
3066
3067 if (unlikely(intel->perf_debug) && shader) {
3068 if (shader->compiled_once)
3069 brw_wm_debug_recompile(brw, prog, &c->key);
3070 shader->compiled_once = true;
3071
3072 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
3073 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3074 (get_time() - start_time) * 1000);
3075 }
3076 }
3077
3078 return generated;
3079 }
3080
3081 bool
3082 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3083 {
3084 struct brw_context *brw = brw_context(ctx);
3085 struct intel_context *intel = &brw->intel;
3086 struct brw_wm_prog_key key;
3087
3088 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3089 return true;
3090
3091 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3092 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3093 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3094 bool program_uses_dfdy = fp->UsesDFdy;
3095
3096 memset(&key, 0, sizeof(key));
3097
3098 if (intel->gen < 6) {
3099 if (fp->UsesKill)
3100 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3101
3102 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3103 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3104
3105 /* Just assume depth testing. */
3106 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3107 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3108 }
3109
3110 if (intel->gen < 6)
3111 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3112
3113 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3114 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3115 continue;
3116
3117 if (intel->gen < 6) {
3118 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3119 key.input_slots_valid |= BITFIELD64_BIT(i);
3120 }
3121 }
3122
3123 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3124
3125 for (int i = 0; i < MAX_SAMPLERS; i++) {
3126 if (fp->Base.ShadowSamplers & (1 << i)) {
3127 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3128 key.tex.swizzles[i] =
3129 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3130 } else {
3131 /* Color sampler: assume no swizzling. */
3132 key.tex.swizzles[i] = SWIZZLE_XYZW;
3133 }
3134 }
3135
3136 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3137 key.drawable_height = ctx->DrawBuffer->Height;
3138 }
3139
3140 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3141 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3142 }
3143
3144 key.nr_color_regions = 1;
3145
3146 key.program_string_id = bfp->id;
3147
3148 uint32_t old_prog_offset = brw->wm.prog_offset;
3149 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3150
3151 bool success = do_wm_prog(brw, prog, bfp, &key);
3152
3153 brw->wm.prog_offset = old_prog_offset;
3154 brw->wm.prog_data = old_prog_data;
3155
3156 return success;
3157 }