i965/fs: Add support for translating ir_triop_fma into MAD.
src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;

   /* This will be the case for almost all instructions. */
   this->regs_written = 1;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

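/* Each line below expands the matching macro into a one-line factory
 * method.  For example, ALU3(MAD) defines:
 *
 *    fs_inst *
 *    fs_visitor::MAD(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_MAD, dst, src0, src1, src2);
 *    }
 *
 * giving the visitor a MAD() helper for emitting the three-source
 * multiply-add used when translating ir_triop_fma.
 */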
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(brw->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
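
/* A typical use, sketched with a placeholder operand: compare a value
 * against zero and let the conditional mod update the flag register for a
 * following predicated instruction:
 *
 *    emit(CMP(reg_null_d, value, fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 *
 * where "value" stands in for whatever fs_reg the caller is testing.
 */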

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (brw->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   if (brw->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}
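
/* Working through the offset split above with hypothetical numbers: for
 * const_offset = 7 and scale = 1, the ADD computes varying_offset + 4
 * (7 & ~3), the send fills 4 contiguous components, and the trailing MOV
 * reads component 3 (7 & 3) of the result.  Loads whose const_offsets
 * differ only in the low two bits produce identical vec4_offset values,
 * which is exactly what lets CSE merge them later.
 */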

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (brw->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

bool
fs_reg::is_valid_3src() const
{
   return file == GRF || file == UNIFORM;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}
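
/* type_size() above counts scalar components, not hardware registers: a
 * float is 1, a vec4 is 4, a mat3 is 9 (a FLOAT base type with 9
 * components), "vec4 a[20]" is 80, and a struct is the sum of its members.
 */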

fs_reg
fs_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                fs_reg(), payload, offset, value));
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write()
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           this->force_uncompressed ||
           this->force_sechalf);
}
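
/* For example, a predicated MOV (when the opcode isn't SEL) only updates
 * the enabled channels, so the destination's previous contents survive in
 * the rest and the write can't screen off earlier definitions.
 */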

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

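/**
 * Allocates a new virtual GRF of /size/ contiguous registers and returns
 * its index, doubling the virtual_grf_sizes array whenever it fills up.
 */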
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
   this->nr_params_remap = v->nr_params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
   (void)params_before;
}
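
/* The prefix match above means a uniform named "light" also picks up
 * storage entries such as "light.position" or "light[2]", while a separate
 * uniform named "lighting" is skipped, since the character after the
 * prefix ('i') is not '.', '[', or the terminator.
 */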


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been set up by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (brw->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (brw->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               struct brw_reg interp = interp_reg(location, k);
               emit_linterp(attr, fs_reg(interp), interpolation_mode,
                            ir->centroid);
               if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
               if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (brw->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (brw->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (brw->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (brw->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (brw->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}
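
/* As a concrete mapping, assuming c->nr_payload_regs == 2: UNIFORM slot 10
 * becomes g3.2 -- payload register 2 plus 10 / 8, component 10 % 8 -- as a
 * vec1 so every execution channel reads the same scalar.
 */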

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (brw->gen >= 6) {
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did the interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         split_grf[inst->src[0].reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}
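
/* The net effect: a 4-register virtual GRF holding a vec4 temporary turns
 * into four independent 1-register VGRFs, so a component that dies early
 * no longer extends the live interval of its siblings.
 */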

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_start[new_index] = virtual_grf_start[i];
            virtual_grf_end[new_index] = virtual_grf_end[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
      this->nr_params_remap = c->prog_data.nr_params;

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            /* Section 5.11 of the OpenGL 4.3 spec says:
             *
             *     "Out-of-bounds reads return undefined values, which include
             *      values from other variables of the active program or zero."
             */
            if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
               constant_nr = 0;
            }

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         /* As above, alias out-of-bounds reads to constant 0. */
         if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
            constant_nr = 0;
         }
         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index,
                                                     *inst->src[i].reladdr,
                                                     pull_constant_loc[uniform] +
                                                     inst->src[i].reg_offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
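
/* For GLSL like "uniform vec4 a[20]; ... a[i]" (the example from
 * VARYING_PULL_CONSTANT_LOAD above), each UNIFORM source carrying a
 * reladdr is rewritten here: the example's 80 float params of "a" are
 * appended to pull_param once, and the access becomes a
 * VARYING_PULL_CONSTANT_LOAD indexed by the reladdr register.
 */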

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}
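
/* Note the addressing at the end: pull constants are fetched as aligned
 * vec4s, so a demoted param at pull_index 13 loads the block at byte
 * offset 48 ((13 * 4) & ~15) and smears component 1 (13 & 3) across the
 * execution channels.
 */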

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}
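
/* Both patterns above strength-reduce to a MOV: "MUL dst, a, 1.0F" becomes
 * "MOV dst, a", and "MUL dst, a, 0.0F" becomes "MOV dst, 0.0F", leaving
 * copy propagation and dead code elimination to clean up the rest.
 */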
1785
1786 /**
1787 * Removes any instructions writing a VGRF where that VGRF is not used by any
1788 * later instruction.
1789 */
1790 bool
1791 fs_visitor::dead_code_eliminate()
1792 {
1793 bool progress = false;
1794 int pc = 0;
1795
1796 calculate_live_intervals();
1797
1798 foreach_list_safe(node, &this->instructions) {
1799 fs_inst *inst = (fs_inst *)node;
1800
1801 if (inst->dst.file == GRF) {
1802 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1803 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1804 inst->remove();
1805 progress = true;
1806 }
1807 }
1808
1809 pc++;
1810 }
1811
1812 if (progress)
1813 live_intervals_valid = false;
1814
1815 return progress;
1816 }
1817
1818 struct dead_code_hash_key
1819 {
1820 int vgrf;
1821 int reg_offset;
1822 };
1823
1824 static bool
1825 dead_code_hash_compare(const void *a, const void *b)
1826 {
1827 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1828 }
1829
1830 static void
1831 clear_dead_code_hash(struct hash_table *ht)
1832 {
1833 struct hash_entry *entry;
1834
1835 hash_table_foreach(ht, entry) {
1836 _mesa_hash_table_remove(ht, entry);
1837 }
1838 }
1839
1840 static void
1841 insert_dead_code_hash(struct hash_table *ht,
1842 int vgrf, int reg_offset, fs_inst *inst)
1843 {
1844 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1845 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1846
1847 key->vgrf = vgrf;
1848 key->reg_offset = reg_offset;
1849
1850 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1851 }
1852
1853 static struct hash_entry *
1854 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1855 {
1856 struct dead_code_hash_key key;
1857
1858 key.vgrf = vgrf;
1859 key.reg_offset = reg_offset;
1860
1861 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1862 }
1863
1864 static void
1865 remove_dead_code_hash(struct hash_table *ht,
1866 int vgrf, int reg_offset)
1867 {
1868 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1869 if (!entry)
1870 return;
1871
1872 _mesa_hash_table_remove(ht, entry);
1873 }
1874
1875 /**
1876 * Walks basic blocks, removing any regs that are written but not read before
1877 * being redefined.
1878 *
1879 * The dead_code_eliminate() function implements a global dead code
1880 * elimination, but it only handles the removing the last write to a register
1881 * if it's never read. This one can handle intermediate writes, but only
1882 * within a basic block.
1883 */
bool
fs_visitor::dead_code_eliminate_local()
{
   struct hash_table *ht;
   bool progress = false;

   ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* At a basic block boundary, empty the HT since we don't track
       * dataflow across blocks here.
       */
      if (inst->is_control_flow()) {
         clear_dead_code_hash(ht);
         continue;
      }

      /* Clear the HT of any instructions that got read. */
      for (int i = 0; i < 3; i++) {
         fs_reg src = inst->src[i];
         if (src.file != GRF)
            continue;

         int read = 1;
         if (inst->is_send_from_grf())
            read = virtual_grf_sizes[src.reg] - src.reg_offset;

         for (int reg_offset = src.reg_offset;
              reg_offset < src.reg_offset + read;
              reg_offset++) {
            remove_dead_code_hash(ht, src.reg, reg_offset);
         }
      }

      /* Add any update of a GRF to the HT, removing a previous write if it
       * wasn't read.
       */
      if (inst->dst.file == GRF) {
         if (inst->regs_written > 1) {
            /* We don't know how to trim channels from an instruction's
             * writes, so we can't incrementally remove unread channels from
             * it.  Just remove whatever it overwrites from the table.
             */
            for (int i = 0; i < inst->regs_written; i++) {
               remove_dead_code_hash(ht,
                                     inst->dst.reg,
                                     inst->dst.reg_offset + i);
            }
         } else {
            struct hash_entry *entry =
               get_dead_code_hash_entry(ht, inst->dst.reg,
                                        inst->dst.reg_offset);

            if (inst->is_partial_write()) {
               /* For a partial write, we can't remove any previous dead code
                * candidate, since we're just modifying its result, but we
                * can be dead code eliminated ourselves.
                */
               if (entry) {
                  entry->data = inst;
               } else {
                  insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
                                        inst);
               }
            } else {
               if (entry) {
                  /* We're completely updating a channel, and there was a
                   * previous write to the channel that wasn't read.  Kill it!
                   */
                  fs_inst *dead_inst = (fs_inst *)entry->data;
                  dead_inst->remove();
                  progress = true;
                  _mesa_hash_table_remove(ht, entry);
               }

               insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
                                     inst);
            }
         }
      }
   }

   _mesa_hash_table_destroy(ht, NULL);

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
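/* A minimal sketch of the pattern (hypothetical register numbering): if
 * the live ranges of vgrf0 and vgrf1 don't interfere, every reference to
 * vgrf0 is renamed to vgrf1 and the copy disappears:
 *
 *    add vgrf0, vgrf2, vgrf3   ->   add vgrf1, vgrf2, vgrf3
 *    mov vgrf1, vgrf0               (removed)
 */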
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * clearing live_intervals_valid, because we only use live intervals
       * for the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *  ^
       *  |
       *  v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
       */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}

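/**
 * Coalesces a raw MOV from a GRF (or uniform) into its later readers, for
 * code the MOV dominates: if neither register involved is overwritten
 * before the end of the program, later reads of the MOV's destination are
 * rewritten to read its source instead and the MOV is removed.
 *
 * A hedged sketch (hypothetical register numbering):
 *
 *    mov vgrf1, vgrf0          ->   (removed)
 *    mul vgrf2, vgrf1, vgrf3        mul vgrf2, vgrf0, vgrf3
 */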
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM) ||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = (inst->src[0].abs ||
                                   inst->src[0].negate ||
                                   inst->src[0].smear != -1 ||
                                   inst->src[0].file == UNIFORM);

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         if (has_source_modifiers) {
            for (int i = 0; i < 3; i++) {
               if (scan_inst->src[i].file == GRF &&
                   scan_inst->src[i].reg == inst->dst.reg &&
                   scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
                   inst->dst.type != scan_inst->src[i].type)
               {
                  interfered = true;
                  break;
               }
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

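/**
 * Attempts to rewrite the instruction generating a GRF value to write it
 * directly into the MRF that a following raw MOV copies it to, so the MOV
 * can be removed.
 *
 * A hedged sketch (hypothetical registers):
 *
 *    add vgrf0, vgrf1, vgrf2   ->   add m3, vgrf1, vgrf2
 *    mov m3, vgrf0                  (removed)
 */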
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         mrf_high = mrf_low + 4;
      } else if (dispatch_width == 16 &&
                 (!inst->force_uncompressed && !inst->force_sechalf)) {
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_end[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
       * the instruction that generated this GRF to write into the MRF
       * instead.
       */
      fs_inst *scan_inst;
      for (scan_inst = (fs_inst *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (fs_inst *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last write to the reg we want to turn into a
             * compute-to-MRF.
             */

            /* If this one instruction didn't populate all the
             * channels, bail.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->is_partial_write())
               break;

            /* Things returning more than one register would need us to
             * understand coalescing out more than one MOV at a time.
             */
            if (scan_inst->regs_written > 1)
               break;

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (brw->gen == 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value. */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove();
               progress = true;
            }
            break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (dispatch_width == 16 &&
                       (!scan_inst->force_uncompressed &&
                        !scan_inst->force_sechalf)) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
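/* For example (hypothetical registers), if nothing between the two MOVs
 * in a basic block writes m2 or vgrf3:
 *
 *    mov m2, vgrf3
 *    ...
 *    mov m2, vgrf3     <- equals the tracked last write to m2; removed
 */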
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* We'd need to update the MRF tracking for compressed instructions;
    * bail in 16-wide dispatch for now.
    */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->is_control_flow()) {
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->is_partial_write()) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

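/**
 * Helper for the gen4 SEND dependency workarounds below: clears the
 * needs-dep flag for any GRF in [first_grf, first_grf + grf_len) that
 * inst reads, since a read of the register resolves its outstanding
 * dependency.
 */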
static void
clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
                        int first_grf, int grf_len)
{
   bool inst_16wide = (dispatch_width > 8 &&
                       !inst->force_uncompressed &&
                       !inst->force_sechalf);

   /* Clear the flag for registers that actually got read (as expected). */
   for (int i = 0; i < 3; i++) {
      int grf;
      if (inst->src[i].file == GRF) {
         grf = inst->src[i].reg;
      } else if (inst->src[i].file == HW_REG &&
                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         grf = inst->src[i].fixed_hw_reg.nr;
      } else {
         continue;
      }

      if (grf >= first_grf &&
          grf < first_grf + grf_len) {
         deps[grf - first_grf] = false;
         if (inst_16wide)
            deps[grf - first_grf + 1] = false;
      }
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *    "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 *     check for post destination dependencies on this instruction, software
 *     must ensure that there is no destination hazard for the case of ‘write
 *     followed by a posted write’ shown in the following example.
 *
 *     1. mov r3 0
 *     2. send r3.xy <rest of send instruction>
 *     3. mov r2 r3
 *
 *     Due to no post-destination dependency check on the ‘send’, the above
 *     code sequence could have two instructions (1 and 2) in flight at the
 *     same time that both consider ‘r3’ as the target of their final writes."
 */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
{
   int reg_size = dispatch_width / 8;
   int write_len = inst->regs_written * reg_size;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   clear_deps_for_inst_src(inst, dispatch_width,
                           needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->prev;
        scan_inst != NULL;
        scan_inst = (fs_inst *)scan_inst->prev) {

      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         return;
      }

      bool scan_inst_16wide = (dispatch_width > 8 &&
                               !scan_inst->force_uncompressed &&
                               !scan_inst->force_sechalf);

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == GRF) {
         for (int i = 0; i < scan_inst->regs_written; i++) {
            int reg = scan_inst->dst.reg + i * reg_size;

            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               inst->insert_before(DEP_RESOLVE_MOV(reg));
               needs_dep[reg - first_write_grf] = false;
               if (scan_inst_16wide)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies. */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *    "[DevBW, DevCL] Errata: A destination register from a send can not be
 *     used as a destination register until after it has been sourced by an
 *     instruction with a different destination register."
 */
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
{
   int write_len = inst->regs_written * dispatch_width / 8;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->next;
        !scan_inst->is_tail_sentinel();
        scan_inst = (fs_inst *)scan_inst->next) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == GRF &&
          scan_inst->dst.reg >= first_write_grf &&
          scan_inst->dst.reg < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
         scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies. */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }

   /* If we hit the end of the program, resolve all remaining dependencies out
    * of paranoia.
    */
   fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
   assert(last_inst->eot);
   for (int i = 0; i < write_len; i++) {
      if (needs_dep[i])
         last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
   }
}

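/**
 * Applies both of the gen4 SEND dependency workarounds above to each
 * message-sending instruction that writes a GRF.  The guard below limits
 * this to the original 965; G4X and later don't need it.
 */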
void
fs_visitor::insert_gen4_send_dependency_workarounds()
{
   if (brw->gen != 4 || brw->is_g4x)
      return;

   /* Note that we're done with register allocation, so GRF fs_regs always
    * have a .reg_offset of 0.
    */

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->mlen != 0 && inst->dst.file == GRF) {
         insert_gen4_pre_send_dependency_workarounds(inst);
         insert_gen4_post_send_dependency_workarounds(inst);
      }
   }
}

/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
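/* On gen7, for example, the generic load below (operand layout is
 * illustrative; src[1] is the vec4-aligned byte offset) is rewritten in
 * place, with a SET_SIMD4X2_OFFSET setup MOV inserted before it:
 *
 *    UNIFORM_PULL_CONSTANT_LOAD vgrf1, surf_index, byte_offset
 * ->
 *    SET_SIMD4X2_OFFSET vgrf2, byte_offset / 4
 *    UNIFORM_PULL_CONSTANT_LOAD_GEN7 vgrf1, surf_index, vgrf2
 */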
void
fs_visitor::lower_uniform_pull_constant_loads()
{
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      if (brw->gen >= 7) {
         /* The offset arg before was a vec4-aligned byte offset.  We need to
          * turn it into a dword offset.
          */
         fs_reg const_offset_reg = inst->src[1];
         assert(const_offset_reg.file == IMM &&
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
         const_offset_reg.imm.u /= 4;
         fs_reg payload = fs_reg(this, glsl_type::uint_type);

         /* This is actually going to be a MOV, but since only the first dword
          * is accessed, we have a special opcode to do just that one.  Note
          * that this needs to be an operation that will be considered a def
          * by live variable analysis, or register allocation will explode.
          */
         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
                                               payload, const_offset_reg);
         setup->force_writemask_all = true;

         setup->ir = inst->ir;
         setup->annotation = inst->annotation;
         inst->insert_before(setup);

         /* Similarly, this will only populate the first 4 channels of the
          * result register (since we only use smear values from 0-3), but we
          * don't tell the optimizer.
          */
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;

         this->live_intervals_valid = false;
      } else {
         /* Before register allocation, we didn't tell the scheduler about the
          * MRF we use.  We know it's safe to use this MRF because nothing
          * else does except for register spill/unspill, which generates and
          * uses its MRF within a single IR instruction.
          */
         inst->base_mrf = 14;
         inst->mlen = 1;
      }
   }
}

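/**
 * Prints a human-readable listing of a single FS IR instruction for
 * debugging, along the lines of (illustrative):
 *
 *    (+f0.0) add.sat vgrf4, vgrf2, u1
 */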
void
fs_visitor::dump_instruction(backend_instruction *be_inst)
{
   fs_inst *inst = (fs_inst *)be_inst;

   if (inst->predicate) {
      printf("(%cf0.%d) ",
             inst->predicate_inverse ? '-' : '+',
             inst->flag_subreg);
   }

   printf("%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      printf(".sat");
   if (inst->conditional_mod) {
      printf(".cmod");
      if (!inst->predicate &&
          (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                            inst->opcode != BRW_OPCODE_IF &&
                            inst->opcode != BRW_OPCODE_WHILE))) {
         printf(".f0.%d", inst->flag_subreg);
      }
   }
   printf(" ");

   switch (inst->dst.file) {
   case GRF:
      printf("vgrf%d", inst->dst.reg);
      if (inst->dst.reg_offset)
         printf("+%d", inst->dst.reg_offset);
      break;
   case MRF:
      printf("m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      printf("(null)");
      break;
   case UNIFORM:
      printf("***u%d***", inst->dst.reg);
      break;
   case ARF:
      if (inst->dst.reg == BRW_ARF_NULL)
         printf("(null)");
      else
         printf("arf%d", inst->dst.reg);
      break;
   default:
      printf("???");
      break;
   }
   printf(", ");

   for (int i = 0; i < 3; i++) {
      if (inst->src[i].negate)
         printf("-");
      if (inst->src[i].abs)
         printf("|");
      switch (inst->src[i].file) {
      case GRF:
         printf("vgrf%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf("+%d", inst->src[i].reg_offset);
         break;
      case MRF:
         printf("***m%d***", inst->src[i].reg);
         break;
      case UNIFORM:
         printf("u%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf(".%d", inst->src[i].reg_offset);
         break;
      case BAD_FILE:
         printf("(null)");
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            printf("%ff", inst->src[i].imm.f);
            break;
         case BRW_REGISTER_TYPE_D:
            printf("%dd", inst->src[i].imm.i);
            break;
         case BRW_REGISTER_TYPE_UD:
            printf("%uu", inst->src[i].imm.u);
            break;
         default:
            printf("???");
            break;
         }
         break;
      default:
         printf("???");
         break;
      }
      if (inst->src[i].abs)
         printf("|");

      if (i < 2)
         printf(", ");
   }

   printf(" ");

   if (inst->force_uncompressed)
      printf("1sthalf ");

   if (inst->force_sechalf)
      printf("2ndhalf ");

   printf("\n");
}

/**
 * Possibly returns an instruction that set up @param reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the returned instruction wrote only
 * reg -- it might be the size=4 destination of a texture instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           fs_reg reg)
{
   if (end == start ||
       end->is_partial_write() ||
       reg.reladdr ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}

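/**
 * Assigns payload register numbers for the gen6+ fragment shader thread
 * payload: fixed mask/coordinate registers first, then one set of
 * barycentric coordinates per enabled interpolation mode, then optional
 * source depth and W values.
 */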
void
fs_visitor::setup_payload_gen6()
{
   bool uses_depth =
      (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(brw->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch. */

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}

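/**
 * Drives the compile for one dispatch width: sets up the payload, visits
 * the shader IR to build FS IR, runs the optimization loop to a fixed
 * point, and performs scheduling and register allocation.  Returns false
 * if the compile failed.
 */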
bool
fs_visitor::run()
{
   sanity_param_count = fp->Base.Parameters->NumParameters;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (brw->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      emit_dummy_fs();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (brw->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (fp->UsesKill) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (The visitor only descends into
       * functions called "main".)
       */
      if (shader) {
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
         return false;

      emit(FS_OPCODE_PLACEHOLDER_HALT);

      emit_fb_writes();

      split_virtual_grfs();

      move_uniform_array_access_to_pull_constants();
      setup_pull_constants();

      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = dead_code_eliminate() || progress;
         progress = dead_code_eliminate_local() || progress;
         progress = register_coalesce() || progress;
         progress = register_coalesce_2() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions(false);

      lower_uniform_pull_constant_loads();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
         assign_regs_trivial();
      else {
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
    * so based on the actual physical registers in use.
    */
   insert_gen4_send_dependency_workarounds();

   if (failed)
      return false;

   schedule_instructions(true);

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform. */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == fp->Base.Parameters->NumParameters);

   return !failed;
}

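/**
 * Compiles a fragment program to native code: always runs an 8-wide
 * compile, additionally attempts a 16-wide compile where supported, and
 * hands both instruction lists to the generator.
 */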
const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   bool start_busy = false;
   float start_time = 0;

   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (prog) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      if (prog) {
         prog->LinkStatus = false;
         ralloc_strcat(&prog->InfoLog, v.fail_msg);
      }

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
      if (c->prog_data.nr_pull_params == 0) {
         /* Try a 16-wide compile */
         v2.import_uniforms(&v);
         if (!v2.run()) {
            perf_debug("16-wide shader failed to compile, falling back to "
                       "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
         } else {
            simd16_instructions = &v2.instructions;
         }
      } else {
         perf_debug("Skipping 16-wide due to pull parameters.\n");
      }
   }

   c->prog_data.dispatch_width = 8;

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   simd16_instructions,
                                                   final_assembly_size);

   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return generated;
}

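/**
 * Precompiles the fragment shader at link time with a guessed program key,
 * so that a program matching the common draw-time state is already built
 * when it's first needed.
 */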
bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_wm_prog_key key;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

   if (brw->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (brw->gen < 6)
      key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);

   for (int i = 0; i < VARYING_SLOT_MAX; i++) {
      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
         continue;

      if (brw->gen < 6) {
         if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
            key.input_slots_valid |= BITFIELD64_BIT(i);
      }
   }

   key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;

   unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (fp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

   if (fp->Base.InputsRead & VARYING_BIT_POS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   key.nr_color_regions = 1;

   key.program_string_id = bfp->id;

   uint32_t old_prog_offset = brw->wm.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}