i965/vec4: Only zero out unused message components when there are any.
[mesa.git] / src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
66 }
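/* For illustration: plain ALU instructions keep regs_written == 1, while
 * message-style opcodes override it after construction, e.g. the varying
 * pull-constant load later in this file does roughly:
 *
 *    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 *    inst->regs_written = 4 * scale;
 */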
67
68 fs_inst::fs_inst()
69 {
70 init();
71 }
72
73 fs_inst::fs_inst(enum opcode opcode)
74 {
75 init();
76 this->opcode = opcode;
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
80 {
81 init();
82 this->opcode = opcode;
83 this->dst = dst;
84
85 if (dst.file == GRF)
86 assert(dst.reg_offset >= 0);
87 }
88
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
90 {
91 init();
92 this->opcode = opcode;
93 this->dst = dst;
94 this->src[0] = src0;
95
96 if (dst.file == GRF)
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
103 {
104 init();
105 this->opcode = opcode;
106 this->dst = dst;
107 this->src[0] = src0;
108 this->src[1] = src1;
109
110 if (dst.file == GRF)
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
116 }
117
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
120 {
121 init();
122 this->opcode = opcode;
123 this->dst = dst;
124 this->src[0] = src0;
125 this->src[1] = src1;
126 this->src[2] = src2;
127
128 if (dst.file == GRF)
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
136 }
137
138 #define ALU1(op) \
139 fs_inst * \
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
141 { \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
143 }
144
145 #define ALU2(op) \
146 fs_inst * \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
148 { \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
150 }
151
152 #define ALU3(op) \
153 fs_inst * \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
155 { \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
157 }
158
159 ALU1(NOT)
160 ALU1(MOV)
161 ALU1(FRC)
162 ALU1(RNDD)
163 ALU1(RNDE)
164 ALU1(RNDZ)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
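/* For illustration, ALU2(ADD) expands to the emitter helper:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */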
183
184 /** Gen4 predicated IF. */
185 fs_inst *
186 fs_visitor::IF(uint32_t predicate)
187 {
188 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
189 inst->predicate = predicate;
190 return inst;
191 }
192
193 /** Gen6+ IF with embedded comparison. */
194 fs_inst *
195 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
196 {
197 assert(brw->gen >= 6);
198 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
199 reg_null_d, src0, src1);
200 inst->conditional_mod = condition;
201 return inst;
202 }
203
204 /**
205 * CMP: Sets the low bit of the destination channels with the result
206 * of the comparison, while the upper bits are undefined, and updates
207 * the flag register with the packed 16 bits of the result.
208 */
209 fs_inst *
210 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
211 {
212 fs_inst *inst;
213
214 /* Take the instruction:
215 *
216 * CMP null<d> src0<f> src1<f>
217 *
218 * Original gen4 does type conversion to the destination type before
219 * comparison, producing garbage results for floating point comparisons.
220 * gen5 does the comparison on the execution type (resolved source types),
221 * so dst type doesn't matter. gen6 does comparison and then uses the
222 * result as if it was the dst type with no conversion, which happens to
223 * mostly work out for float-interpreted-as-int since our comparisons are
224 * for >0, =0, <0.
225 */
226 if (brw->gen == 4) {
227 dst.type = src0.type;
228 if (dst.file == HW_REG)
229 dst.fixed_hw_reg.type = dst.type;
230 }
231
232 resolve_ud_negate(&src0);
233 resolve_ud_negate(&src1);
234
235 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
236 inst->conditional_mod = condition;
237
238 return inst;
239 }
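/* A typical use, as in emit_frontfacing_interpolation() below, compares two
 * operands and then masks off the defined low bit of each channel:
 *
 *    emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
 *    emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
 */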
240
241 exec_list
242 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
243 fs_reg varying_offset,
244 uint32_t const_offset)
245 {
246 exec_list instructions;
247 fs_inst *inst;
248
249 /* We have our constant surface use a pitch of 4 bytes, so our index can
250 * be any component of a vector, and then we load 4 contiguous
251 * components starting from that.
252 *
253 * We break down the const_offset to a portion added to the variable
254 * offset and a portion done using reg_offset, which means that if you
255 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
256 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
257 * CSE can later notice that those loads are all the same and eliminate
258 * the redundant ones.
259 */
260 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
261 instructions.push_tail(ADD(vec4_offset,
262 varying_offset, const_offset & ~3));
263
264 int scale = 1;
265 if (brw->gen == 4 && dispatch_width == 8) {
266 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
267 * u, v, r) as parameters, or we can just use the SIMD16 message
268 * consisting of (header, u). We choose the second, at the cost of a
269 * longer return length.
270 */
271 scale = 2;
272 }
273
274 enum opcode op;
275 if (brw->gen >= 7)
276 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
277 else
278 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
279 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
280 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
281 inst->regs_written = 4 * scale;
282 instructions.push_tail(inst);
283
284 if (brw->gen < 7) {
285 inst->base_mrf = 13;
286 inst->header_present = true;
287 if (brw->gen == 4)
288 inst->mlen = 3;
289 else
290 inst->mlen = 1 + dispatch_width / 8;
291 }
292
293 vec4_result.reg_offset += (const_offset & 3) * scale;
294 instructions.push_tail(MOV(dst, vec4_result));
295
296 return instructions;
297 }
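/* Worked example of the const_offset split above: with const_offset == 6,
 * the ADD feeds vec4_offset = varying_offset + (6 & ~3) = varying_offset + 4,
 * and the trailing MOV reads component (6 & 3) = 2 of the returned vec4
 * (scaled by `scale` on gen4 SIMD8).
 */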
298
299 /**
300 * A helper for MOV generation for fixing up broken hardware SEND dependency
301 * handling.
302 */
303 fs_inst *
304 fs_visitor::DEP_RESOLVE_MOV(int grf)
305 {
306 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
307
308 inst->ir = NULL;
309 inst->annotation = "send dependency resolve";
310
311 /* The caller always wants uncompressed to emit the minimal extra
312 * dependencies, and to avoid having to deal with aligning its regs to 2.
313 */
314 inst->force_uncompressed = true;
315
316 return inst;
317 }
318
319 bool
320 fs_inst::equals(fs_inst *inst)
321 {
322 return (opcode == inst->opcode &&
323 dst.equals(inst->dst) &&
324 src[0].equals(inst->src[0]) &&
325 src[1].equals(inst->src[1]) &&
326 src[2].equals(inst->src[2]) &&
327 saturate == inst->saturate &&
328 predicate == inst->predicate &&
329 conditional_mod == inst->conditional_mod &&
330 mlen == inst->mlen &&
331 base_mrf == inst->base_mrf &&
332 sampler == inst->sampler &&
333 target == inst->target &&
334 eot == inst->eot &&
335 header_present == inst->header_present &&
336 shadow_compare == inst->shadow_compare &&
337 offset == inst->offset);
338 }
339
340 bool
341 fs_inst::overwrites_reg(const fs_reg &reg)
342 {
343 return (reg.file == dst.file &&
344 reg.reg == dst.reg &&
345 reg.reg_offset >= dst.reg_offset &&
346 reg.reg_offset < dst.reg_offset + regs_written);
347 }
348
349 bool
350 fs_inst::is_send_from_grf()
351 {
352 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
353 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
354 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
355 src[1].file == GRF));
356 }
357
358 bool
359 fs_visitor::can_do_source_mods(fs_inst *inst)
360 {
361 if (brw->gen == 6 && inst->is_math())
362 return false;
363
364 if (inst->is_send_from_grf())
365 return false;
366
367 return true;
368 }
369
370 void
371 fs_reg::init()
372 {
373 memset(this, 0, sizeof(*this));
374 this->smear = -1;
375 }
376
377 /** Generic unset register constructor. */
378 fs_reg::fs_reg()
379 {
380 init();
381 this->file = BAD_FILE;
382 }
383
384 /** Immediate value constructor. */
385 fs_reg::fs_reg(float f)
386 {
387 init();
388 this->file = IMM;
389 this->type = BRW_REGISTER_TYPE_F;
390 this->imm.f = f;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(int32_t i)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_D;
399 this->imm.i = i;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(uint32_t u)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_UD;
408 this->imm.u = u;
409 }
410
411 /** Fixed brw_reg Immediate value constructor. */
412 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
413 {
414 init();
415 this->file = HW_REG;
416 this->fixed_hw_reg = fixed_hw_reg;
417 this->type = fixed_hw_reg.type;
418 }
419
420 bool
421 fs_reg::equals(const fs_reg &r) const
422 {
423 return (file == r.file &&
424 reg == r.reg &&
425 reg_offset == r.reg_offset &&
426 type == r.type &&
427 negate == r.negate &&
428 abs == r.abs &&
429 !reladdr && !r.reladdr &&
430 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
431 sizeof(fixed_hw_reg)) == 0 &&
432 smear == r.smear &&
433 imm.u == r.imm.u);
434 }
435
436 bool
437 fs_reg::is_zero() const
438 {
439 if (file != IMM)
440 return false;
441
442 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
443 }
444
445 bool
446 fs_reg::is_one() const
447 {
448 if (file != IMM)
449 return false;
450
451 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
452 }
453
454 bool
455 fs_reg::is_valid_3src() const
456 {
457 return file == GRF || file == UNIFORM;
458 }
459
460 int
461 fs_visitor::type_size(const struct glsl_type *type)
462 {
463 unsigned int size, i;
464
465 switch (type->base_type) {
466 case GLSL_TYPE_UINT:
467 case GLSL_TYPE_INT:
468 case GLSL_TYPE_FLOAT:
469 case GLSL_TYPE_BOOL:
470 return type->components();
471 case GLSL_TYPE_ARRAY:
472 return type_size(type->fields.array) * type->length;
473 case GLSL_TYPE_STRUCT:
474 size = 0;
475 for (i = 0; i < type->length; i++) {
476 size += type_size(type->fields.structure[i].type);
477 }
478 return size;
479 case GLSL_TYPE_SAMPLER:
480 /* Samplers take up no register space, since they're baked in at
481 * link time.
482 */
483 return 0;
484 case GLSL_TYPE_VOID:
485 case GLSL_TYPE_ERROR:
486 case GLSL_TYPE_INTERFACE:
487 assert(!"not reached");
488 break;
489 }
490
491 return 0;
492 }
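/* For example, under the rules above: float -> 1, vec4 -> 4, mat3 -> 9,
 * "vec4 a[20]" -> 80, a struct is the sum of its members, and samplers -> 0.
 */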
493
494 fs_reg
495 fs_visitor::get_timestamp()
496 {
497 assert(brw->gen >= 7);
498
499 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
500 BRW_ARF_TIMESTAMP,
501 0),
502 BRW_REGISTER_TYPE_UD));
503
504 fs_reg dst = fs_reg(this, glsl_type::uint_type);
505
506 fs_inst *mov = emit(MOV(dst, ts));
507 /* We want to read the 3 fields we care about (mostly field 0, but also field 2)
508 * even if it's not enabled in the dispatch.
509 */
510 mov->force_writemask_all = true;
511 mov->force_uncompressed = true;
512
513 /* The caller wants the low 32 bits of the timestamp. Since it's running
514 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
515 * which is plenty of time for our purposes. It is identical across the
516 * EUs, but since it's tracking GPU core speed it will increment at a
517 * varying rate as render P-states change.
518 *
519 * The caller could also check if render P-states have changed (or anything
520 * else that might disrupt timing) by setting smear to 2 and checking if
521 * that field is != 0.
522 */
523 dst.smear = 0;
524
525 return dst;
526 }
527
528 void
529 fs_visitor::emit_shader_time_begin()
530 {
531 current_annotation = "shader time start";
532 shader_start_time = get_timestamp();
533 }
534
535 void
536 fs_visitor::emit_shader_time_end()
537 {
538 current_annotation = "shader time end";
539
540 enum shader_time_shader_type type, written_type, reset_type;
541 if (dispatch_width == 8) {
542 type = ST_FS8;
543 written_type = ST_FS8_WRITTEN;
544 reset_type = ST_FS8_RESET;
545 } else {
546 assert(dispatch_width == 16);
547 type = ST_FS16;
548 written_type = ST_FS16_WRITTEN;
549 reset_type = ST_FS16_RESET;
550 }
551
552 fs_reg shader_end_time = get_timestamp();
553
554 /* Check that there weren't any timestamp reset events (assuming these
555 * were the only two timestamp reads that happened).
556 */
557 fs_reg reset = shader_end_time;
558 reset.smear = 2;
559 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
560 test->conditional_mod = BRW_CONDITIONAL_Z;
561 emit(IF(BRW_PREDICATE_NORMAL));
562
563 push_force_uncompressed();
564 fs_reg start = shader_start_time;
565 start.negate = true;
566 fs_reg diff = fs_reg(this, glsl_type::uint_type);
567 emit(ADD(diff, start, shader_end_time));
568
569 /* If there were no instructions between the two timestamp gets, the diff
570 * is 2 cycles. Remove that overhead, so I can forget about that when
571 * trying to determine the time taken for single instructions.
572 */
573 emit(ADD(diff, diff, fs_reg(-2u)));
574
575 emit_shader_time_write(type, diff);
576 emit_shader_time_write(written_type, fs_reg(1u));
577 emit(BRW_OPCODE_ELSE);
578 emit_shader_time_write(reset_type, fs_reg(1u));
579 emit(BRW_OPCODE_ENDIF);
580
581 pop_force_uncompressed();
582 }
583
584 void
585 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
586 fs_reg value)
587 {
588 int shader_time_index =
589 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
590 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
591
592 fs_reg payload;
593 if (dispatch_width == 8)
594 payload = fs_reg(this, glsl_type::uvec2_type);
595 else
596 payload = fs_reg(this, glsl_type::uint_type);
597
598 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
599 fs_reg(), payload, offset, value));
600 }
601
602 void
603 fs_visitor::fail(const char *format, ...)
604 {
605 va_list va;
606 char *msg;
607
608 if (failed)
609 return;
610
611 failed = true;
612
613 va_start(va, format);
614 msg = ralloc_vasprintf(mem_ctx, format, va);
615 va_end(va);
616 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
617
618 this->fail_msg = msg;
619
620 if (INTEL_DEBUG & DEBUG_WM) {
621 fprintf(stderr, "%s", msg);
622 }
623 }
624
625 fs_inst *
626 fs_visitor::emit(enum opcode opcode)
627 {
628 return emit(fs_inst(opcode));
629 }
630
631 fs_inst *
632 fs_visitor::emit(enum opcode opcode, fs_reg dst)
633 {
634 return emit(fs_inst(opcode, dst));
635 }
636
637 fs_inst *
638 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
639 {
640 return emit(fs_inst(opcode, dst, src0));
641 }
642
643 fs_inst *
644 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
645 {
646 return emit(fs_inst(opcode, dst, src0, src1));
647 }
648
649 fs_inst *
650 fs_visitor::emit(enum opcode opcode, fs_reg dst,
651 fs_reg src0, fs_reg src1, fs_reg src2)
652 {
653 return emit(fs_inst(opcode, dst, src0, src1, src2));
654 }
655
656 void
657 fs_visitor::push_force_uncompressed()
658 {
659 force_uncompressed_stack++;
660 }
661
662 void
663 fs_visitor::pop_force_uncompressed()
664 {
665 force_uncompressed_stack--;
666 assert(force_uncompressed_stack >= 0);
667 }
668
669 void
670 fs_visitor::push_force_sechalf()
671 {
672 force_sechalf_stack++;
673 }
674
675 void
676 fs_visitor::pop_force_sechalf()
677 {
678 force_sechalf_stack--;
679 assert(force_sechalf_stack >= 0);
680 }
681
682 /**
683 * Returns true if the instruction has a flag that means it won't
684 * update an entire destination register.
685 *
686 * For example, dead code elimination and live variable analysis want to know
687 * when a write to a variable screens off any preceding values that were in
688 * it.
689 */
690 bool
691 fs_inst::is_partial_write()
692 {
693 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
694 this->force_uncompressed ||
695 this->force_sechalf);
696 }
697
698 /**
699 * Returns how many MRFs an FS opcode will write over.
700 *
701 * Note that this is not the 0 or 1 implied writes in an actual gen
702 * instruction -- the FS opcodes often generate MOVs in addition.
703 */
704 int
705 fs_visitor::implied_mrf_writes(fs_inst *inst)
706 {
707 if (inst->mlen == 0)
708 return 0;
709
710 switch (inst->opcode) {
711 case SHADER_OPCODE_RCP:
712 case SHADER_OPCODE_RSQ:
713 case SHADER_OPCODE_SQRT:
714 case SHADER_OPCODE_EXP2:
715 case SHADER_OPCODE_LOG2:
716 case SHADER_OPCODE_SIN:
717 case SHADER_OPCODE_COS:
718 return 1 * dispatch_width / 8;
719 case SHADER_OPCODE_POW:
720 case SHADER_OPCODE_INT_QUOTIENT:
721 case SHADER_OPCODE_INT_REMAINDER:
722 return 2 * dispatch_width / 8;
723 case SHADER_OPCODE_TEX:
724 case FS_OPCODE_TXB:
725 case SHADER_OPCODE_TXD:
726 case SHADER_OPCODE_TXF:
727 case SHADER_OPCODE_TXF_MS:
728 case SHADER_OPCODE_TXL:
729 case SHADER_OPCODE_TXS:
730 case SHADER_OPCODE_LOD:
731 return 1;
732 case FS_OPCODE_FB_WRITE:
733 return 2;
734 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
735 case FS_OPCODE_UNSPILL:
736 return 1;
737 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
738 return inst->mlen;
739 case FS_OPCODE_SPILL:
740 return 2;
741 default:
742 assert(!"not reached");
743 return inst->mlen;
744 }
745 }
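/* For example: SHADER_OPCODE_POW in SIMD16 (dispatch_width == 16) implies
 * 2 * 16 / 8 = 4 MRF writes, versus 2 in SIMD8.
 */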
746
747 int
748 fs_visitor::virtual_grf_alloc(int size)
749 {
750 if (virtual_grf_array_size <= virtual_grf_count) {
751 if (virtual_grf_array_size == 0)
752 virtual_grf_array_size = 16;
753 else
754 virtual_grf_array_size *= 2;
755 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
756 virtual_grf_array_size);
757 }
758 virtual_grf_sizes[virtual_grf_count] = size;
759 return virtual_grf_count++;
760 }
761
762 /** Fixed HW reg constructor. */
763 fs_reg::fs_reg(enum register_file file, int reg)
764 {
765 init();
766 this->file = file;
767 this->reg = reg;
768 this->type = BRW_REGISTER_TYPE_F;
769 }
770
771 /** Fixed HW reg constructor. */
772 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
773 {
774 init();
775 this->file = file;
776 this->reg = reg;
777 this->type = type;
778 }
779
780 /** Automatic reg constructor. */
781 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
782 {
783 init();
784
785 this->file = GRF;
786 this->reg = v->virtual_grf_alloc(v->type_size(type));
787 this->reg_offset = 0;
788 this->type = brw_type_for_base_type(type);
789 }
790
791 fs_reg *
792 fs_visitor::variable_storage(ir_variable *var)
793 {
794 return (fs_reg *)hash_table_find(this->variable_ht, var);
795 }
796
797 void
798 import_uniforms_callback(const void *key,
799 void *data,
800 void *closure)
801 {
802 struct hash_table *dst_ht = (struct hash_table *)closure;
803 const fs_reg *reg = (const fs_reg *)data;
804
805 if (reg->file != UNIFORM)
806 return;
807
808 hash_table_insert(dst_ht, data, key);
809 }
810
811 /* For 16-wide, we need to follow the uniform setup already done for the
812 * 8-wide dispatch. This brings in those uniform definitions.
813 */
814 void
815 fs_visitor::import_uniforms(fs_visitor *v)
816 {
817 hash_table_call_foreach(v->variable_ht,
818 import_uniforms_callback,
819 variable_ht);
820 this->params_remap = v->params_remap;
821 this->nr_params_remap = v->nr_params_remap;
822 }
823
824 /* Our support for uniforms is piggy-backed on the struct
825 * gl_fragment_program, because that's where the values actually
826 * get stored, rather than in some global gl_shader_program uniform
827 * store.
828 */
829 void
830 fs_visitor::setup_uniform_values(ir_variable *ir)
831 {
832 int namelen = strlen(ir->name);
833
834 /* The data for our (non-builtin) uniforms is stored in a series of
835 * gl_uniform_driver_storage structs for each subcomponent that
836 * glGetUniformLocation() could name. We know it's been set up in the same
837 * order we'd walk the type, so walk the list of storage and find anything
838 * with our name, or the prefix of a component that starts with our name.
839 */
840 unsigned params_before = c->prog_data.nr_params;
841 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
842 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
843
844 if (strncmp(ir->name, storage->name, namelen) != 0 ||
845 (storage->name[namelen] != 0 &&
846 storage->name[namelen] != '.' &&
847 storage->name[namelen] != '[')) {
848 continue;
849 }
850
851 unsigned slots = storage->type->component_slots();
852 if (storage->array_elements)
853 slots *= storage->array_elements;
854
855 for (unsigned i = 0; i < slots; i++) {
856 c->prog_data.param[c->prog_data.nr_params++] =
857 &storage->storage[i].f;
858 }
859 }
860
861 /* Make sure we actually initialized the right amount of stuff here. */
862 assert(params_before + ir->type->component_slots() ==
863 c->prog_data.nr_params);
864 (void)params_before;
865 }
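/* Sketch of the matching above, for a hypothetical "uniform struct { vec4 a;
 * float b; } s;": the storage entries "s.a" and "s.b" both match the prefix
 * "s" followed by '.', appending 4 + 1 = 5 param slots, which equals
 * ir->type->component_slots() as the assert requires.
 */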
866
867
868 /* Our support for builtin uniforms is even scarier than non-builtin.
869 * It sits on top of the PROG_STATE_VAR parameters that are
870 * automatically updated from GL context state.
871 */
872 void
873 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
874 {
875 const ir_state_slot *const slots = ir->state_slots;
876 assert(ir->state_slots != NULL);
877
878 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
879 /* This state reference has already been setup by ir_to_mesa, but we'll
880 * get the same index back here.
881 */
882 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
883 (gl_state_index *)slots[i].tokens);
884
885 /* Add each of the unique swizzles of the element as a parameter.
886 * This'll end up matching the expected layout of the
887 * array/matrix/structure we're trying to fill in.
888 */
889 int last_swiz = -1;
890 for (unsigned int j = 0; j < 4; j++) {
891 int swiz = GET_SWZ(slots[i].swizzle, j);
892 if (swiz == last_swiz)
893 break;
894 last_swiz = swiz;
895
896 c->prog_data.param[c->prog_data.nr_params++] =
897 &fp->Base.Parameters->ParameterValues[index][swiz].f;
898 }
899 }
900 }
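/* Example of the unique-swizzle walk above: a state vec4 swizzled
 * (x, y, z, w) adds four params, one per channel, while a scalar value
 * replicated as (x, x, x, x) stops after one because swiz == last_swiz.
 */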
901
902 fs_reg *
903 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
904 {
905 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
906 fs_reg wpos = *reg;
907 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
908
909 /* gl_FragCoord.x */
910 if (ir->pixel_center_integer) {
911 emit(MOV(wpos, this->pixel_x));
912 } else {
913 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
914 }
915 wpos.reg_offset++;
916
917 /* gl_FragCoord.y */
918 if (!flip && ir->pixel_center_integer) {
919 emit(MOV(wpos, this->pixel_y));
920 } else {
921 fs_reg pixel_y = this->pixel_y;
922 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
923
924 if (flip) {
925 pixel_y.negate = true;
926 offset += c->key.drawable_height - 1.0;
927 }
928
929 emit(ADD(wpos, pixel_y, fs_reg(offset)));
930 }
931 wpos.reg_offset++;
932
933 /* gl_FragCoord.z */
934 if (brw->gen >= 6) {
935 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
936 } else {
937 emit(FS_OPCODE_LINTERP, wpos,
938 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
939 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
940 interp_reg(VARYING_SLOT_POS, 2));
941 }
942 wpos.reg_offset++;
943
944 /* gl_FragCoord.w: Already set up in emit_interpolation */
945 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
946
947 return reg;
948 }
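/* The flipped-Y path above folds the window flip and pixel center into one
 * ADD:
 *
 *    wpos.y = -pixel_y + (drawable_height - 1.0 + center)
 *
 * where center is 0.0 for integer pixel centers and 0.5 otherwise.
 */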
949
950 fs_inst *
951 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
952 glsl_interp_qualifier interpolation_mode,
953 bool is_centroid)
954 {
955 brw_wm_barycentric_interp_mode barycoord_mode;
956 if (brw->gen >= 6) {
957 if (is_centroid) {
958 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
959 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
960 else
961 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
962 } else {
963 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
964 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
965 else
966 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
967 }
968 } else {
969 /* On Ironlake and below, there is only one interpolation mode.
970 * Centroid interpolation doesn't mean anything on this hardware --
971 * there is no multisampling.
972 */
973 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
974 }
975 return emit(FS_OPCODE_LINTERP, attr,
976 this->delta_x[barycoord_mode],
977 this->delta_y[barycoord_mode], interp);
978 }
979
980 fs_reg *
981 fs_visitor::emit_general_interpolation(ir_variable *ir)
982 {
983 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
984 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
985 fs_reg attr = *reg;
986
987 unsigned int array_elements;
988 const glsl_type *type;
989
990 if (ir->type->is_array()) {
991 array_elements = ir->type->length;
992 if (array_elements == 0) {
993 fail("dereferenced array '%s' has length 0\n", ir->name);
994 }
995 type = ir->type->fields.array;
996 } else {
997 array_elements = 1;
998 type = ir->type;
999 }
1000
1001 glsl_interp_qualifier interpolation_mode =
1002 ir->determine_interpolation_mode(c->key.flat_shade);
1003
1004 int location = ir->location;
1005 for (unsigned int i = 0; i < array_elements; i++) {
1006 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1007 if (urb_setup[location] == -1) {
1008 /* If there's no incoming setup data for this slot, don't
1009 * emit interpolation for it.
1010 */
1011 attr.reg_offset += type->vector_elements;
1012 location++;
1013 continue;
1014 }
1015
1016 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1017 /* Constant interpolation (flat shading) case. The SF has
1018 * handed us defined values in only the constant offset
1019 * field of the setup reg.
1020 */
1021 for (unsigned int k = 0; k < type->vector_elements; k++) {
1022 struct brw_reg interp = interp_reg(location, k);
1023 interp = suboffset(interp, 3);
1024 interp.type = reg->type;
1025 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1026 attr.reg_offset++;
1027 }
1028 } else {
1029 /* Smooth/noperspective interpolation case. */
1030 for (unsigned int k = 0; k < type->vector_elements; k++) {
1031 /* FINISHME: At some point we probably want to push
1032 * this farther by giving similar treatment to the
1033 * other potentially constant components of the
1034 * attribute, as well as making brw_vs_constval.c
1035 * handle varyings other than gl_TexCoord.
1036 */
1037 struct brw_reg interp = interp_reg(location, k);
1038 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1039 ir->centroid);
1040 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1041 /* Get the pixel/sample mask into f0 so that we know
1042 * which pixels are lit. Then, for each channel that is
1043 * unlit, replace the centroid data with non-centroid
1044 * data.
1045 */
1046 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1047 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1048 interpolation_mode, false);
1049 inst->predicate = BRW_PREDICATE_NORMAL;
1050 inst->predicate_inverse = true;
1051 }
1052 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1053 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1054 }
1055 attr.reg_offset++;
1056 }
1057
1058 }
1059 location++;
1060 }
1061 }
1062
1063 return reg;
1064 }
1065
1066 fs_reg *
1067 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1068 {
1069 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1070
1071 /* The frontfacing comes in as a bit in the thread payload. */
1072 if (brw->gen >= 6) {
1073 emit(BRW_OPCODE_ASR, *reg,
1074 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1075 fs_reg(15));
1076 emit(BRW_OPCODE_NOT, *reg, *reg);
1077 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1078 } else {
1079 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1080 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1081 * us front face
1082 */
1083 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1084 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1085 }
1086
1087 return reg;
1088 }
1089
1090 fs_reg
1091 fs_visitor::fix_math_operand(fs_reg src)
1092 {
1093 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1094 * might be able to do better by doing execsize = 1 math and then
1095 * expanding that result out, but we would need to be careful with
1096 * masking.
1097 *
1098 * The hardware ignores source modifiers (negate and abs) on math
1099 * instructions, so we also move to a temp to set those up.
1100 */
1101 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1102 !src.abs && !src.negate)
1103 return src;
1104
1105 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1106 * operands to math instructions.
1107 */
1108 if (brw->gen >= 7 && src.file != IMM)
1109 return src;
1110
1111 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1112 expanded.type = src.type;
1113 emit(BRW_OPCODE_MOV, expanded, src);
1114 return expanded;
1115 }
1116
1117 fs_inst *
1118 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1119 {
1120 switch (opcode) {
1121 case SHADER_OPCODE_RCP:
1122 case SHADER_OPCODE_RSQ:
1123 case SHADER_OPCODE_SQRT:
1124 case SHADER_OPCODE_EXP2:
1125 case SHADER_OPCODE_LOG2:
1126 case SHADER_OPCODE_SIN:
1127 case SHADER_OPCODE_COS:
1128 break;
1129 default:
1130 assert(!"not reached: bad math opcode");
1131 return NULL;
1132 }
1133
1134 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1135 * might be able to do better by doing execsize = 1 math and then
1136 * expanding that result out, but we would need to be careful with
1137 * masking.
1138 *
1139 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1140 * instructions, so we also move to a temp to set those up.
1141 */
1142 if (brw->gen >= 6)
1143 src = fix_math_operand(src);
1144
1145 fs_inst *inst = emit(opcode, dst, src);
1146
1147 if (brw->gen < 6) {
1148 inst->base_mrf = 2;
1149 inst->mlen = dispatch_width / 8;
1150 }
1151
1152 return inst;
1153 }
1154
1155 fs_inst *
1156 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1157 {
1158 int base_mrf = 2;
1159 fs_inst *inst;
1160
1161 switch (opcode) {
1162 case SHADER_OPCODE_INT_QUOTIENT:
1163 case SHADER_OPCODE_INT_REMAINDER:
1164 if (brw->gen >= 7 && dispatch_width == 16)
1165 fail("16-wide INTDIV unsupported\n");
1166 break;
1167 case SHADER_OPCODE_POW:
1168 break;
1169 default:
1170 assert(!"not reached: unsupported binary math opcode.");
1171 return NULL;
1172 }
1173
1174 if (brw->gen >= 6) {
1175 src0 = fix_math_operand(src0);
1176 src1 = fix_math_operand(src1);
1177
1178 inst = emit(opcode, dst, src0, src1);
1179 } else {
1180 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1181 * "Message Payload":
1182 *
1183 * "Operand0[7]. For the INT DIV functions, this operand is the
1184 * denominator."
1185 * ...
1186 * "Operand1[7]. For the INT DIV functions, this operand is the
1187 * numerator."
1188 */
1189 bool is_int_div = opcode != SHADER_OPCODE_POW;
1190 fs_reg &op0 = is_int_div ? src1 : src0;
1191 fs_reg &op1 = is_int_div ? src0 : src1;
1192
1193 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1194 inst = emit(opcode, dst, op0, reg_null_f);
1195
1196 inst->base_mrf = base_mrf;
1197 inst->mlen = 2 * dispatch_width / 8;
1198 }
1199 return inst;
1200 }
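/* On the pre-gen6 path above, an integer division a / b therefore sends the
 * denominator as operand 0 and the numerator through the MRF: op0 = b,
 * op1 = a; POW keeps its natural src0, src1 order.
 */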
1201
1202 void
1203 fs_visitor::assign_curb_setup()
1204 {
1205 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1206 if (dispatch_width == 8) {
1207 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1208 } else {
1209 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1210 }
1211
1212 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1213 foreach_list(node, &this->instructions) {
1214 fs_inst *inst = (fs_inst *)node;
1215
1216 for (unsigned int i = 0; i < 3; i++) {
1217 if (inst->src[i].file == UNIFORM) {
1218 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1219 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1220 constant_nr / 8,
1221 constant_nr % 8);
1222
1223 inst->src[i].file = HW_REG;
1224 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1225 }
1226 }
1227 }
1228 }
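/* For illustration, assuming nr_payload_regs == 2: UNIFORM slot 11 maps to
 * brw_vec1_grf(2 + 11 / 8, 11 % 8), i.e. the vec1 at g3.3 in the pushed
 * CURB space.
 */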
1229
1230 void
1231 fs_visitor::calculate_urb_setup()
1232 {
1233 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1234 urb_setup[i] = -1;
1235 }
1236
1237 int urb_next = 0;
1238 /* Figure out where each of the incoming setup attributes lands. */
1239 if (brw->gen >= 6) {
1240 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1241 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1242 urb_setup[i] = urb_next++;
1243 }
1244 }
1245 } else {
1246 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1247 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1248 /* Point size is packed into the header, not as a general attribute */
1249 if (i == VARYING_SLOT_PSIZ)
1250 continue;
1251
1252 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1253 /* The back color slot is skipped when the front color is
1254 * also written to. In addition, some slots can be
1255 * written in the vertex shader and not read in the
1256 * fragment shader. So the register number must always be
1257 * incremented, mapped or not.
1258 */
1259 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1260 urb_setup[i] = urb_next;
1261 urb_next++;
1262 }
1263 }
1264
1265 /*
1266 * It's an FS-only attribute, and we did interpolation for this attribute
1267 * in the SF thread. So, count it here, too.
1268 *
1269 * See compile_sf_prog() for more info.
1270 */
1271 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1272 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1273 }
1274
1275 /* Each attribute is 4 setup channels, each of which is half a reg. */
1276 c->prog_data.urb_read_length = urb_next * 2;
1277 }
1278
1279 void
1280 fs_visitor::assign_urb_setup()
1281 {
1282 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1283
1284 /* Offset all the urb_setup[] index by the actual position of the
1285 * setup regs, now that the location of the constants has been chosen.
1286 */
1287 foreach_list(node, &this->instructions) {
1288 fs_inst *inst = (fs_inst *)node;
1289
1290 if (inst->opcode == FS_OPCODE_LINTERP) {
1291 assert(inst->src[2].file == HW_REG);
1292 inst->src[2].fixed_hw_reg.nr += urb_start;
1293 }
1294
1295 if (inst->opcode == FS_OPCODE_CINTERP) {
1296 assert(inst->src[0].file == HW_REG);
1297 inst->src[0].fixed_hw_reg.nr += urb_start;
1298 }
1299 }
1300
1301 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1302 }
1303
1304 /**
1305 * Split large virtual GRFs into separate components if we can.
1306 *
1307 * This is mostly duplicated with what brw_fs_vector_splitting does,
1308 * but that's really conservative because it's afraid of doing
1309 * splitting that doesn't result in real progress after the rest of
1310 * the optimization phases, which would cause infinite looping in
1311 * optimization. We can do it once here, safely. This also has the
1312 * opportunity to split interpolated values, or maybe even uniforms,
1313 * which we don't have at the IR level.
1314 *
1315 * We want to split, because virtual GRFs are what we register
1316 * allocate and spill (due to contiguousness requirements for some
1317 * instructions), and they're what we naturally generate in the
1318 * codegen process, but most virtual GRFs don't actually need to be
1319 * contiguous sets of GRFs. If we split, we'll end up with reduced
1320 * live intervals and better dead code elimination and coalescing.
1321 */
1322 void
1323 fs_visitor::split_virtual_grfs()
1324 {
1325 int num_vars = this->virtual_grf_count;
1326 bool split_grf[num_vars];
1327 int new_virtual_grf[num_vars];
1328
1329 /* Try to split anything larger than one register. */
1330 for (int i = 0; i < num_vars; i++) {
1331 if (this->virtual_grf_sizes[i] != 1)
1332 split_grf[i] = true;
1333 else
1334 split_grf[i] = false;
1335 }
1336
1337 if (brw->has_pln &&
1338 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1339 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1340 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1341 * Gen6, that was the only supported interpolation mode, and since Gen6,
1342 * delta_x and delta_y are in fixed hardware registers.
1343 */
1344 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1345 false;
1346 }
1347
1348 foreach_list(node, &this->instructions) {
1349 fs_inst *inst = (fs_inst *)node;
1350
1351 /* If there's a SEND message that requires contiguous destination
1352 * registers, no splitting is allowed.
1353 */
1354 if (inst->regs_written > 1) {
1355 split_grf[inst->dst.reg] = false;
1356 }
1357
1358 /* If we're sending from a GRF, don't split it, on the assumption that
1359 * the send is reading the whole thing.
1360 */
1361 if (inst->is_send_from_grf()) {
1362 for (int i = 0; i < 3; i++) {
1363 if (inst->src[i].file == GRF) {
1364 split_grf[inst->src[i].reg] = false;
1365 }
1366 }
1367 }
1368 }
1369
1370 /* Allocate new space for split regs. Note that the virtual
1371 * numbers will be contiguous.
1372 */
1373 for (int i = 0; i < num_vars; i++) {
1374 if (split_grf[i]) {
1375 new_virtual_grf[i] = virtual_grf_alloc(1);
1376 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1377 int reg = virtual_grf_alloc(1);
1378 assert(reg == new_virtual_grf[i] + j - 1);
1379 (void) reg;
1380 }
1381 this->virtual_grf_sizes[i] = 1;
1382 }
1383 }
1384
1385 foreach_list(node, &this->instructions) {
1386 fs_inst *inst = (fs_inst *)node;
1387
1388 if (inst->dst.file == GRF &&
1389 split_grf[inst->dst.reg] &&
1390 inst->dst.reg_offset != 0) {
1391 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1392 inst->dst.reg_offset - 1);
1393 inst->dst.reg_offset = 0;
1394 }
1395 for (int i = 0; i < 3; i++) {
1396 if (inst->src[i].file == GRF &&
1397 split_grf[inst->src[i].reg] &&
1398 inst->src[i].reg_offset != 0) {
1399 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1400 inst->src[i].reg_offset - 1);
1401 inst->src[i].reg_offset = 0;
1402 }
1403 }
1404 }
1405 this->live_intervals_valid = false;
1406 }
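/* Sketch of the remapping above for a size-4 VGRF v: reg_offset 0 stays at
 * v (whose size is trimmed to 1), while reg_offsets 1..3 move to the newly
 * allocated registers new_virtual_grf[v] + 0..2, each of size 1.
 */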
1407
1408 /**
1409 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1410 *
1411 * During code generation, we create tons of temporary variables, many of
1412 * which get immediately killed and are never used again. Yet, in later
1413 * optimization and analysis passes, such as compute_live_intervals, we need
1414 * to loop over all the virtual GRFs. Compacting them can save a lot of
1415 * overhead.
1416 */
1417 void
1418 fs_visitor::compact_virtual_grfs()
1419 {
1420 /* Mark which virtual GRFs are used, and count how many. */
1421 int remap_table[this->virtual_grf_count];
1422 memset(remap_table, -1, sizeof(remap_table));
1423
1424 foreach_list(node, &this->instructions) {
1425 const fs_inst *inst = (const fs_inst *) node;
1426
1427 if (inst->dst.file == GRF)
1428 remap_table[inst->dst.reg] = 0;
1429
1430 for (int i = 0; i < 3; i++) {
1431 if (inst->src[i].file == GRF)
1432 remap_table[inst->src[i].reg] = 0;
1433 }
1434 }
1435
1436 /* In addition to registers used in instructions, fs_visitor keeps
1437 * direct references to certain special values which must be patched:
1438 */
1439 fs_reg *special[] = {
1440 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1441 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1442 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1443 &delta_x[0], &delta_x[1], &delta_x[2],
1444 &delta_x[3], &delta_x[4], &delta_x[5],
1445 &delta_y[0], &delta_y[1], &delta_y[2],
1446 &delta_y[3], &delta_y[4], &delta_y[5],
1447 };
1448 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1449 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1450
1451 /* Treat all special values as used, to be conservative */
1452 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1453 if (special[i]->file == GRF)
1454 remap_table[special[i]->reg] = 0;
1455 }
1456
1457 /* Compact the GRF arrays. */
1458 int new_index = 0;
1459 for (int i = 0; i < this->virtual_grf_count; i++) {
1460 if (remap_table[i] != -1) {
1461 remap_table[i] = new_index;
1462 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1463 if (live_intervals_valid) {
1464 virtual_grf_start[new_index] = virtual_grf_start[i];
1465 virtual_grf_end[new_index] = virtual_grf_end[i];
1466 }
1467 ++new_index;
1468 }
1469 }
1470
1471 this->virtual_grf_count = new_index;
1472
1473 /* Patch all the instructions to use the newly renumbered registers */
1474 foreach_list(node, &this->instructions) {
1475 fs_inst *inst = (fs_inst *) node;
1476
1477 if (inst->dst.file == GRF)
1478 inst->dst.reg = remap_table[inst->dst.reg];
1479
1480 for (int i = 0; i < 3; i++) {
1481 if (inst->src[i].file == GRF)
1482 inst->src[i].reg = remap_table[inst->src[i].reg];
1483 }
1484 }
1485
1486 /* Patch all the references to special values */
1487 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1488 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1489 special[i]->reg = remap_table[special[i]->reg];
1490 }
1491 }
1492
1493 bool
1494 fs_visitor::remove_dead_constants()
1495 {
1496 if (dispatch_width == 8) {
1497 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1498 this->nr_params_remap = c->prog_data.nr_params;
1499
1500 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1501 this->params_remap[i] = -1;
1502
1503 /* Find which params are still in use. */
1504 foreach_list(node, &this->instructions) {
1505 fs_inst *inst = (fs_inst *)node;
1506
1507 for (int i = 0; i < 3; i++) {
1508 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1509
1510 if (inst->src[i].file != UNIFORM)
1511 continue;
1512
1513 /* Section 5.11 of the OpenGL 4.3 spec says:
1514 *
1515 * "Out-of-bounds reads return undefined values, which include
1516 * values from other variables of the active program or zero."
1517 */
1518 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1519 constant_nr = 0;
1520 }
1521
1522 /* For now, set this to non-negative. We'll give it the
1523 * actual new number in a moment, in order to keep the
1524 * register numbers nicely ordered.
1525 */
1526 this->params_remap[constant_nr] = 0;
1527 }
1528 }
1529
1530 /* Figure out what the new numbers for the params will be. At some
1531 * point when we're doing uniform array access, we're going to want
1532 * to keep the distinction between .reg and .reg_offset, but for
1533 * now we don't care.
1534 */
1535 unsigned int new_nr_params = 0;
1536 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1537 if (this->params_remap[i] != -1) {
1538 this->params_remap[i] = new_nr_params++;
1539 }
1540 }
1541
1542 /* Update the list of params to be uploaded to match our new numbering. */
1543 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1544 int remapped = this->params_remap[i];
1545
1546 if (remapped == -1)
1547 continue;
1548
1549 c->prog_data.param[remapped] = c->prog_data.param[i];
1550 }
1551
1552 c->prog_data.nr_params = new_nr_params;
1553 } else {
1554 /* This should have been generated in the 8-wide pass already. */
1555 assert(this->params_remap);
1556 }
1557
1558 /* Now do the renumbering of the shader to remove unused params. */
1559 foreach_list(node, &this->instructions) {
1560 fs_inst *inst = (fs_inst *)node;
1561
1562 for (int i = 0; i < 3; i++) {
1563 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1564
1565 if (inst->src[i].file != UNIFORM)
1566 continue;
1567
1568 /* As above, alias out-of-bounds accesses to constant 0. */
1569 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1570 constant_nr = 0;
1571 }
1572 assert(this->params_remap[constant_nr] != -1);
1573 inst->src[i].reg = this->params_remap[constant_nr];
1574 inst->src[i].reg_offset = 0;
1575 }
1576 }
1577
1578 return true;
1579 }
1580
1581 /*
1582 * Implements array access of uniforms by inserting a
1583 * PULL_CONSTANT_LOAD instruction.
1584 *
1585 * Unlike temporary GRF array access (where we don't support it due to
1586 * the difficulty of doing relative addressing on instruction
1587 * destinations), we could potentially do array access of uniforms
1588 * that were loaded in GRF space as push constants. In real-world
1589 * usage we've seen, though, the arrays being used are always larger
1590 * than we could load as push constants, so just always move all
1591 * uniform array access out to a pull constant buffer.
1592 */
1593 void
1594 fs_visitor::move_uniform_array_access_to_pull_constants()
1595 {
1596 int pull_constant_loc[c->prog_data.nr_params];
1597
1598 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1599 pull_constant_loc[i] = -1;
1600 }
1601
1602 /* Walk through and find array access of uniforms. Put a copy of that
1603 * uniform in the pull constant buffer.
1604 *
1605 * Note that we don't move constant-indexed accesses to arrays. No
1606 * testing has been done of the performance impact of this choice.
1607 */
1608 foreach_list_safe(node, &this->instructions) {
1609 fs_inst *inst = (fs_inst *)node;
1610
1611 for (int i = 0 ; i < 3; i++) {
1612 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1613 continue;
1614
1615 int uniform = inst->src[i].reg;
1616
1617 /* If this array isn't already present in the pull constant buffer,
1618 * add it.
1619 */
1620 if (pull_constant_loc[uniform] == -1) {
1621 const float **values = &c->prog_data.param[uniform];
1622
1623 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1624
1625 assert(param_size[uniform]);
1626
1627 for (int j = 0; j < param_size[uniform]; j++) {
1628 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1629 values[j];
1630 }
1631 }
1632
1633 /* Set up the annotation tracking for new generated instructions. */
1634 base_ir = inst->ir;
1635 current_annotation = inst->annotation;
1636
1637 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1638 fs_reg temp = fs_reg(this, glsl_type::float_type);
1639 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1640 surf_index,
1641 *inst->src[i].reladdr,
1642 pull_constant_loc[uniform] +
1643 inst->src[i].reg_offset);
1644 inst->insert_before(&list);
1645
1646 inst->src[i].file = temp.file;
1647 inst->src[i].reg = temp.reg;
1648 inst->src[i].reg_offset = temp.reg_offset;
1649 inst->src[i].reladdr = NULL;
1650 }
1651 }
1652 }
1653
1654 /**
1655 * Choose accesses from the UNIFORM file to demote to using the pull
1656 * constant buffer.
1657 *
1658 * We allow a fragment shader to have more than the specified minimum
1659 * maximum number of fragment shader uniform components (64). If
1660 * there are too many of these, they'd fill up all of register space.
1661 * So, this will push some of them out to the pull constant buffer and
1662 * update the program to load them.
1663 */
1664 void
1665 fs_visitor::setup_pull_constants()
1666 {
1667 /* Only allow 16 registers (128 uniform components) as push constants. */
1668 unsigned int max_uniform_components = 16 * 8;
1669 if (c->prog_data.nr_params <= max_uniform_components)
1670 return;
1671
1672 if (dispatch_width == 16) {
1673 fail("Pull constants not supported in 16-wide\n");
1674 return;
1675 }
1676
1677 /* Just demote the end of the list. We could probably do better
1678 * here, demoting things that are rarely used in the program first.
1679 */
1680 unsigned int pull_uniform_base = max_uniform_components;
1681
1682 int pull_constant_loc[c->prog_data.nr_params];
1683 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1684 if (i < pull_uniform_base) {
1685 pull_constant_loc[i] = -1;
1686 } else {
1687 pull_constant_loc[i] = -1;
1688 /* If our constant is already being uploaded for reladdr purposes,
1689 * reuse it.
1690 */
1691 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1692 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1693 pull_constant_loc[i] = j;
1694 break;
1695 }
1696 }
1697 if (pull_constant_loc[i] == -1) {
1698 int pull_index = c->prog_data.nr_pull_params++;
1699 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1700 pull_constant_loc[i] = pull_index;
1701 }
1702 }
1703 }
1704 c->prog_data.nr_params = pull_uniform_base;
1705
1706 foreach_list(node, &this->instructions) {
1707 fs_inst *inst = (fs_inst *)node;
1708
1709 for (int i = 0; i < 3; i++) {
1710 if (inst->src[i].file != UNIFORM)
1711 continue;
1712
1713 int pull_index = pull_constant_loc[inst->src[i].reg +
1714 inst->src[i].reg_offset];
1715 if (pull_index == -1)
1716 continue;
1717
1718 assert(!inst->src[i].reladdr);
1719
1720 fs_reg dst = fs_reg(this, glsl_type::float_type);
1721 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1722 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1723 fs_inst *pull =
1724 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1725 dst, index, offset);
1726 pull->ir = inst->ir;
1727 pull->annotation = inst->annotation;
1728
1729 inst->insert_before(pull);
1730
1731 inst->src[i].file = GRF;
1732 inst->src[i].reg = dst.reg;
1733 inst->src[i].reg_offset = 0;
1734 inst->src[i].smear = pull_index & 3;
1735 }
1736 }
1737 }
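/* Worked example of the demotion above: pull_index == 6 yields the
 * 16-byte-aligned offset (6 * 4) & ~15 == 16, and smear == 6 & 3 == 2 then
 * selects the third dword of the loaded block.
 */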
1738
1739 bool
1740 fs_visitor::opt_algebraic()
1741 {
1742 bool progress = false;
1743
1744 foreach_list(node, &this->instructions) {
1745 fs_inst *inst = (fs_inst *)node;
1746
1747 switch (inst->opcode) {
1748 case BRW_OPCODE_MUL:
1749 if (inst->src[1].file != IMM)
1750 continue;
1751
1752 /* a * 1.0 = a */
1753 if (inst->src[1].is_one()) {
1754 inst->opcode = BRW_OPCODE_MOV;
1755 inst->src[1] = reg_undef;
1756 progress = true;
1757 break;
1758 }
1759
1760 /* a * 0.0 = 0.0 */
1761 if (inst->src[1].is_zero()) {
1762 inst->opcode = BRW_OPCODE_MOV;
1763 inst->src[0] = inst->src[1];
1764 inst->src[1] = reg_undef;
1765 progress = true;
1766 break;
1767 }
1768
1769 break;
1770 case BRW_OPCODE_ADD:
1771 if (inst->src[1].file != IMM)
1772 continue;
1773
1774 /* a + 0.0 = a */
1775 if (inst->src[1].is_zero()) {
1776 inst->opcode = BRW_OPCODE_MOV;
1777 inst->src[1] = reg_undef;
1778 progress = true;
1779 break;
1780 }
1781 break;
1782 default:
1783 break;
1784 }
1785 }
1786
1787 return progress;
1788 }
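/* The rewrites above, in instruction form:
 *
 *    MUL dst, a, 1.0f   ->   MOV dst, a
 *    MUL dst, a, 0.0f   ->   MOV dst, 0.0f
 *    ADD dst, a, 0.0f   ->   MOV dst, a
 */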
1789
1790 /**
1791 * Removes any instructions writing a VGRF where that VGRF is not used by any
1792 * later instruction.
1793 */
1794 bool
1795 fs_visitor::dead_code_eliminate()
1796 {
1797 bool progress = false;
1798 int pc = 0;
1799
1800 calculate_live_intervals();
1801
1802 foreach_list_safe(node, &this->instructions) {
1803 fs_inst *inst = (fs_inst *)node;
1804
1805 if (inst->dst.file == GRF) {
1806 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1807 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1808 inst->remove();
1809 progress = true;
1810 }
1811 }
1812
1813 pc++;
1814 }
1815
1816 if (progress)
1817 live_intervals_valid = false;
1818
1819 return progress;
1820 }
1821
1822 struct dead_code_hash_key
1823 {
1824 int vgrf;
1825 int reg_offset;
1826 };
1827
1828 static bool
1829 dead_code_hash_compare(const void *a, const void *b)
1830 {
1831 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1832 }
1833
1834 static void
1835 clear_dead_code_hash(struct hash_table *ht)
1836 {
1837 struct hash_entry *entry;
1838
1839 hash_table_foreach(ht, entry) {
1840 _mesa_hash_table_remove(ht, entry);
1841 }
1842 }
1843
1844 static void
1845 insert_dead_code_hash(struct hash_table *ht,
1846 int vgrf, int reg_offset, fs_inst *inst)
1847 {
1848 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1849 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1850
1851 key->vgrf = vgrf;
1852 key->reg_offset = reg_offset;
1853
1854 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1855 }
1856
1857 static struct hash_entry *
1858 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1859 {
1860 struct dead_code_hash_key key;
1861
1862 key.vgrf = vgrf;
1863 key.reg_offset = reg_offset;
1864
1865 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1866 }
1867
1868 static void
1869 remove_dead_code_hash(struct hash_table *ht,
1870 int vgrf, int reg_offset)
1871 {
1872 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1873 if (!entry)
1874 return;
1875
1876 _mesa_hash_table_remove(ht, entry);
1877 }
1878
1879 /**
1880 * Walks basic blocks, removing any regs that are written but not read before
1881 * being redefined.
1882 *
1883 * The dead_code_eliminate() function implements a global dead code
1884 * elimination, but it only handles removing the last write to a register
1885 * if it's never read. This one can handle intermediate writes, but only
1886 * within a basic block.
1887 */
1888 bool
1889 fs_visitor::dead_code_eliminate_local()
1890 {
1891 struct hash_table *ht;
1892 bool progress = false;
1893
1894 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1895
1896 foreach_list_safe(node, &this->instructions) {
1897 fs_inst *inst = (fs_inst *)node;
1898
1899 /* At a basic block, empty the HT since we don't understand dataflow
1900 * here.
1901 */
1902 if (inst->is_control_flow()) {
1903 clear_dead_code_hash(ht);
1904 continue;
1905 }
1906
1907 /* Clear the HT of any instructions that got read. */
1908 for (int i = 0; i < 3; i++) {
1909 fs_reg src = inst->src[i];
1910 if (src.file != GRF)
1911 continue;
1912
1913 int read = 1;
1914 if (inst->is_send_from_grf())
1915 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1916
1917 for (int reg_offset = src.reg_offset;
1918 reg_offset < src.reg_offset + read;
1919 reg_offset++) {
1920 remove_dead_code_hash(ht, src.reg, reg_offset);
1921 }
1922 }
1923
1924 /* Add any update of a GRF to the HT, removing a previous write if it
1925 * wasn't read.
1926 */
1927 if (inst->dst.file == GRF) {
1928 if (inst->regs_written > 1) {
1929 /* We don't know how to trim channels from an instruction's
1930 * writes, so we can't incrementally remove unread channels from
1931 * it. Just remove whatever it overwrites from the table.
1932 */
1933 for (int i = 0; i < inst->regs_written; i++) {
1934 remove_dead_code_hash(ht,
1935 inst->dst.reg,
1936 inst->dst.reg_offset + i);
1937 }
1938 } else {
1939 struct hash_entry *entry =
1940 get_dead_code_hash_entry(ht, inst->dst.reg,
1941 inst->dst.reg_offset);
1942
1943 if (inst->is_partial_write()) {
1944 /* For a partial write, we can't remove any previous dead code
1945 * candidate, since we're just modifying its result, but we can
1946 * be dead code eliminated ourselves.
1947 */
1948 if (entry) {
1949 entry->data = inst;
1950 } else {
1951 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1952 inst);
1953 }
1954 } else {
1955 if (entry) {
1956 /* We're completely updating a channel, and there was a
1957 * previous write to the channel that wasn't read. Kill it!
1958 */
1959                fs_inst *dead_inst = (fs_inst *)entry->data;
1960                dead_inst->remove();
1961 progress = true;
1962 _mesa_hash_table_remove(ht, entry);
1963 }
1964
1965 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1966 inst);
1967 }
1968 }
1969 }
1970 }
1971
1972 _mesa_hash_table_destroy(ht, NULL);
1973
1974 if (progress)
1975 live_intervals_valid = false;
1976
1977 return progress;
1978 }
1979
1980 /**
1981  * Implements a second type of register coalescing: this one checks if
1982  * the two regs involved in a raw move don't interfere, in which case
1983  * they can both be stored in the same place and the MOV removed.
1984 */
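/* A sketch of the case this handles (hypothetical IR, register names
 * invented for illustration):
 *
 *    add vgrf3, vgrf1, vgrf2
 *    mov vgrf4, vgrf3
 *
 * If vgrf3 and vgrf4 don't interfere, every reference to vgrf3 is rewritten
 * to vgrf4 and the MOV is removed.
 */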
1985 bool
1986 fs_visitor::register_coalesce_2()
1987 {
1988 bool progress = false;
1989
1990 calculate_live_intervals();
1991
1992 foreach_list_safe(node, &this->instructions) {
1993 fs_inst *inst = (fs_inst *)node;
1994
1995 if (inst->opcode != BRW_OPCODE_MOV ||
1996 inst->is_partial_write() ||
1997 inst->saturate ||
1998 inst->src[0].file != GRF ||
1999 inst->src[0].negate ||
2000 inst->src[0].abs ||
2001 inst->src[0].smear != -1 ||
2002 inst->dst.file != GRF ||
2003 inst->dst.type != inst->src[0].type ||
2004 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2005 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2006 continue;
2007 }
2008
2009 int reg_from = inst->src[0].reg;
2010 assert(inst->src[0].reg_offset == 0);
2011 int reg_to = inst->dst.reg;
2012 int reg_to_offset = inst->dst.reg_offset;
2013
2014 foreach_list(node, &this->instructions) {
2015 fs_inst *scan_inst = (fs_inst *)node;
2016
2017 if (scan_inst->dst.file == GRF &&
2018 scan_inst->dst.reg == reg_from) {
2019 scan_inst->dst.reg = reg_to;
2020 scan_inst->dst.reg_offset = reg_to_offset;
2021 }
2022 for (int i = 0; i < 3; i++) {
2023 if (scan_inst->src[i].file == GRF &&
2024 scan_inst->src[i].reg == reg_from) {
2025 scan_inst->src[i].reg = reg_to;
2026 scan_inst->src[i].reg_offset = reg_to_offset;
2027 }
2028 }
2029 }
2030
2031 inst->remove();
2032
2033 /* We don't need to recalculate live intervals inside the loop despite
2034 * flagging live_intervals_valid because we only use live intervals for
2035 * the interferes test, and we must have had a situation where the
2036 * intervals were:
2037 *
2038     *  from  to
2039     *   ^
2040     *   |
2041     *   v
2042     *         ^
2043     *         |
2044     *         v
2045 *
2046 * Some register R that might get coalesced with one of these two could
2047 * only be referencing "to", otherwise "from"'s range would have been
2048 * longer. R's range could also only start at the end of "to" or later,
2049 * otherwise it will conflict with "to" when we try to coalesce "to"
2050     * into R anyway.
2051 */
2052 live_intervals_valid = false;
2053
2054 progress = true;
2055 continue;
2056 }
2057
2058 return progress;
2059 }
2060
2061 bool
2062 fs_visitor::register_coalesce()
2063 {
2064 bool progress = false;
2065 int if_depth = 0;
2066 int loop_depth = 0;
2067
2068 foreach_list_safe(node, &this->instructions) {
2069 fs_inst *inst = (fs_inst *)node;
2070
2071       /* Make sure that we dominate the instructions we're going to
2072        * scan for interference with our coalescing, or we won't have
2073        * scanned far enough to see everything that could interfere.
2074        * We don't dominate the following instructions if we're in a
2075        * loop or an if block.
2076 */
2077 switch (inst->opcode) {
2078 case BRW_OPCODE_DO:
2079 loop_depth++;
2080 break;
2081 case BRW_OPCODE_WHILE:
2082 loop_depth--;
2083 break;
2084 case BRW_OPCODE_IF:
2085 if_depth++;
2086 break;
2087 case BRW_OPCODE_ENDIF:
2088 if_depth--;
2089 break;
2090 default:
2091 break;
2092 }
2093 if (loop_depth || if_depth)
2094 continue;
2095
2096 if (inst->opcode != BRW_OPCODE_MOV ||
2097 inst->is_partial_write() ||
2098 inst->saturate ||
2099 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2100                                   inst->src[0].file != UNIFORM) ||
2101 inst->dst.type != inst->src[0].type)
2102 continue;
2103
2104 bool has_source_modifiers = (inst->src[0].abs ||
2105 inst->src[0].negate ||
2106 inst->src[0].smear != -1 ||
2107 inst->src[0].file == UNIFORM);
2108
2109 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2110 * them: check for no writes to either one until the exit of the
2111 * program.
2112 */
2113 bool interfered = false;
2114
2115 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2116 !scan_inst->is_tail_sentinel();
2117 scan_inst = (fs_inst *)scan_inst->next) {
2118 if (scan_inst->dst.file == GRF) {
2119 if (scan_inst->overwrites_reg(inst->dst) ||
2120 scan_inst->overwrites_reg(inst->src[0])) {
2121 interfered = true;
2122 break;
2123 }
2124 }
2125
2126 if (has_source_modifiers) {
2127 for (int i = 0; i < 3; i++) {
2128 if (scan_inst->src[i].file == GRF &&
2129 scan_inst->src[i].reg == inst->dst.reg &&
2130 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2131 inst->dst.type != scan_inst->src[i].type)
2132 {
2133 interfered = true;
2134 break;
2135 }
2136 }
2137 }
2138
2140 /* The gen6 MATH instruction can't handle source modifiers or
2141 * unusual register regions, so avoid coalescing those for
2142 * now. We should do something more specific.
2143 */
2144 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2145 interfered = true;
2146 break;
2147 }
2148
2149 /* The accumulator result appears to get used for the
2150 * conditional modifier generation. When negating a UD
2151 * value, there is a 33rd bit generated for the sign in the
2152        * accumulator value, so you can no longer check, for example,
2153 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2154 */
2155 if (scan_inst->conditional_mod &&
2156 inst->src[0].negate &&
2157 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2158 interfered = true;
2159 break;
2160 }
2161 }
2162 if (interfered) {
2163 continue;
2164 }
2165
2166 /* Rewrite the later usage to point at the source of the move to
2167 * be removed.
2168 */
2169 for (fs_inst *scan_inst = inst;
2170 !scan_inst->is_tail_sentinel();
2171 scan_inst = (fs_inst *)scan_inst->next) {
2172 for (int i = 0; i < 3; i++) {
2173 if (scan_inst->src[i].file == GRF &&
2174 scan_inst->src[i].reg == inst->dst.reg &&
2175 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2176 fs_reg new_src = inst->src[0];
2177 if (scan_inst->src[i].abs) {
2178 new_src.negate = 0;
2179 new_src.abs = 1;
2180 }
2181 new_src.negate ^= scan_inst->src[i].negate;
2182 scan_inst->src[i] = new_src;
2183 }
2184 }
2185 }
2186
2187 inst->remove();
2188 progress = true;
2189 }
2190
2191 if (progress)
2192 live_intervals_valid = false;
2193
2194 return progress;
2195 }
2196
2197
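/**
 * Rewrites the instruction computing a GRF value so that it writes directly
 * into the MRF that a following raw MOV was copying the value to, then
 * removes the MOV.
 *
 * A sketch of the transformation (hypothetical IR, register names invented
 * for illustration):
 *
 *    add vgrf3, vgrf1, vgrf2        add m4, vgrf1, vgrf2
 *    mov m4, vgrf3            =>    (MOV removed)
 */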
2198 bool
2199 fs_visitor::compute_to_mrf()
2200 {
2201 bool progress = false;
2202 int next_ip = 0;
2203
2204 calculate_live_intervals();
2205
2206 foreach_list_safe(node, &this->instructions) {
2207 fs_inst *inst = (fs_inst *)node;
2208
2209 int ip = next_ip;
2210 next_ip++;
2211
2212 if (inst->opcode != BRW_OPCODE_MOV ||
2213 inst->is_partial_write() ||
2214 inst->dst.file != MRF || inst->src[0].file != GRF ||
2215 inst->dst.type != inst->src[0].type ||
2216 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2217 continue;
2218
2219 /* Work out which hardware MRF registers are written by this
2220 * instruction.
2221 */
2222 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2223 int mrf_high;
2224 if (inst->dst.reg & BRW_MRF_COMPR4) {
2225 mrf_high = mrf_low + 4;
2226 } else if (dispatch_width == 16 &&
2227 (!inst->force_uncompressed && !inst->force_sechalf)) {
2228 mrf_high = mrf_low + 1;
2229 } else {
2230 mrf_high = mrf_low;
2231 }
2232
2233 /* Can't compute-to-MRF this GRF if someone else was going to
2234 * read it later.
2235 */
2236 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2237 continue;
2238
2239 /* Found a move of a GRF to a MRF. Let's see if we can go
2240 * rewrite the thing that made this GRF to write into the MRF.
2241 */
2242 fs_inst *scan_inst;
2243 for (scan_inst = (fs_inst *)inst->prev;
2244 scan_inst->prev != NULL;
2245 scan_inst = (fs_inst *)scan_inst->prev) {
2246 if (scan_inst->dst.file == GRF &&
2247 scan_inst->dst.reg == inst->src[0].reg) {
2248 /* Found the last thing to write our reg we want to turn
2249 * into a compute-to-MRF.
2250 */
2251
2252 /* If this one instruction didn't populate all the
2253 * channels, bail. We might be able to rewrite everything
2254 * that writes that reg, but it would require smarter
2255 * tracking to delay the rewriting until complete success.
2256 */
2257 if (scan_inst->is_partial_write())
2258 break;
2259
2260 /* Things returning more than one register would need us to
2261 * understand coalescing out more than one MOV at a time.
2262 */
2263 if (scan_inst->regs_written > 1)
2264 break;
2265
2266 /* SEND instructions can't have MRF as a destination. */
2267 if (scan_inst->mlen)
2268 break;
2269
2270 if (brw->gen == 6) {
2271 /* gen6 math instructions must have the destination be
2272 * GRF, so no compute-to-MRF for them.
2273 */
2274 if (scan_inst->is_math()) {
2275 break;
2276 }
2277 }
2278
2279 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2280 /* Found the creator of our MRF's source value. */
2281 scan_inst->dst.file = MRF;
2282 scan_inst->dst.reg = inst->dst.reg;
2283 scan_inst->saturate |= inst->saturate;
2284 inst->remove();
2285 progress = true;
2286 }
2287 break;
2288 }
2289
2290 /* We don't handle control flow here. Most computation of
2291        * values that end up in MRFs happens shortly before the MRF
2292 * write anyway.
2293 */
2294 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2295 break;
2296
2297 /* You can't read from an MRF, so if someone else reads our
2298 * MRF's source GRF that we wanted to rewrite, that stops us.
2299 */
2300 bool interfered = false;
2301 for (int i = 0; i < 3; i++) {
2302 if (scan_inst->src[i].file == GRF &&
2303 scan_inst->src[i].reg == inst->src[0].reg &&
2304 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2305 interfered = true;
2306 }
2307 }
2308 if (interfered)
2309 break;
2310
2311 if (scan_inst->dst.file == MRF) {
2312 /* If somebody else writes our MRF here, we can't
2313 * compute-to-MRF before that.
2314 */
2315 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2316 int scan_mrf_high;
2317
2318 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2319 scan_mrf_high = scan_mrf_low + 4;
2320 } else if (dispatch_width == 16 &&
2321 (!scan_inst->force_uncompressed &&
2322 !scan_inst->force_sechalf)) {
2323 scan_mrf_high = scan_mrf_low + 1;
2324 } else {
2325 scan_mrf_high = scan_mrf_low;
2326 }
2327
2328 if (mrf_low == scan_mrf_low ||
2329 mrf_low == scan_mrf_high ||
2330 mrf_high == scan_mrf_low ||
2331 mrf_high == scan_mrf_high) {
2332 break;
2333 }
2334 }
2335
2336 if (scan_inst->mlen > 0) {
2337 /* Found a SEND instruction, which means that there are
2338 * live values in MRFs from base_mrf to base_mrf +
2339 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2340 * above it.
2341 */
2342 if (mrf_low >= scan_inst->base_mrf &&
2343 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2344 break;
2345 }
2346 if (mrf_high >= scan_inst->base_mrf &&
2347 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2348 break;
2349 }
2350 }
2351 }
2352 }
2353
2354 if (progress)
2355 live_intervals_valid = false;
2356
2357 return progress;
2358 }
2359
2360 /**
2361 * Walks through basic blocks, looking for repeated MRF writes and
2362 * removing the later ones.
2363 */
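/* A sketch of the case this handles (hypothetical IR, for illustration):
 *
 *    mov m2, vgrf3
 *    ...                  <- nothing writes m2 or vgrf3 in between
 *    mov m2, vgrf3        <- identical later write; removed
 */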
2364 bool
2365 fs_visitor::remove_duplicate_mrf_writes()
2366 {
2367 fs_inst *last_mrf_move[16];
2368 bool progress = false;
2369
2370    /* The MRF tracking below doesn't handle compressed (16-wide) instructions, so bail. */
2371 if (dispatch_width == 16)
2372 return false;
2373
2374 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2375
2376 foreach_list_safe(node, &this->instructions) {
2377 fs_inst *inst = (fs_inst *)node;
2378
2379 if (inst->is_control_flow()) {
2380 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2381 }
2382
2383 if (inst->opcode == BRW_OPCODE_MOV &&
2384 inst->dst.file == MRF) {
2385 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2386 if (prev_inst && inst->equals(prev_inst)) {
2387 inst->remove();
2388 progress = true;
2389 continue;
2390 }
2391 }
2392
2393 /* Clear out the last-write records for MRFs that were overwritten. */
2394 if (inst->dst.file == MRF) {
2395 last_mrf_move[inst->dst.reg] = NULL;
2396 }
2397
2398 if (inst->mlen > 0) {
2399 /* Found a SEND instruction, which will include two or fewer
2400 * implied MRF writes. We could do better here.
2401 */
2402 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2403 last_mrf_move[inst->base_mrf + i] = NULL;
2404 }
2405 }
2406
2407 /* Clear out any MRF move records whose sources got overwritten. */
2408 if (inst->dst.file == GRF) {
2409 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2410 if (last_mrf_move[i] &&
2411 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2412 last_mrf_move[i] = NULL;
2413 }
2414 }
2415 }
2416
2417 if (inst->opcode == BRW_OPCODE_MOV &&
2418 inst->dst.file == MRF &&
2419 inst->src[0].file == GRF &&
2420 !inst->is_partial_write()) {
2421 last_mrf_move[inst->dst.reg] = inst;
2422 }
2423 }
2424
2425 if (progress)
2426 live_intervals_valid = false;
2427
2428 return progress;
2429 }
2430
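/**
 * Helper for the gen4 SEND dependency workarounds below: for each GRF in
 * [first_grf, first_grf + grf_len) that @inst reads, clears the
 * corresponding needs-dep flag, since a read resolves the outstanding
 * dependency.
 */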
2431 static void
2432 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2433 int first_grf, int grf_len)
2434 {
2435 bool inst_16wide = (dispatch_width > 8 &&
2436 !inst->force_uncompressed &&
2437 !inst->force_sechalf);
2438
2439 /* Clear the flag for registers that actually got read (as expected). */
2440 for (int i = 0; i < 3; i++) {
2441 int grf;
2442 if (inst->src[i].file == GRF) {
2443 grf = inst->src[i].reg;
2444 } else if (inst->src[i].file == HW_REG &&
2445 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2446 grf = inst->src[i].fixed_hw_reg.nr;
2447 } else {
2448 continue;
2449 }
2450
2451 if (grf >= first_grf &&
2452 grf < first_grf + grf_len) {
2453 deps[grf - first_grf] = false;
2454 if (inst_16wide)
2455 deps[grf - first_grf + 1] = false;
2456 }
2457 }
2458 }
2459
2460 /**
2461 * Implements this workaround for the original 965:
2462 *
2463 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2464 * check for post destination dependencies on this instruction, software
2465 * must ensure that there is no destination hazard for the case of ‘write
2466 * followed by a posted write’ shown in the following example.
2467 *
2468 * 1. mov r3 0
2469 * 2. send r3.xy <rest of send instruction>
2470 * 3. mov r2 r3
2471 *
2472 * Due to no post-destination dependency check on the ‘send’, the above
2473 * code sequence could have two instructions (1 and 2) in flight at the
2474  *    same time that both consider ‘r3’ as the target of their final writes."
2475 */
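/* The fix, sketched: before emitting such an instruction, walk backwards
 * over prior writes to its destination registers and insert a
 * dependency-resolving MOV (DEP_RESOLVE_MOV) for any register that hasn't
 * been read since it was written, so the earlier write is forced to
 * complete first.
 */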
2476 void
2477 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2478 {
2479 int reg_size = dispatch_width / 8;
2480 int write_len = inst->regs_written * reg_size;
2481 int first_write_grf = inst->dst.reg;
2482 bool needs_dep[BRW_MAX_MRF];
2483 assert(write_len < (int)sizeof(needs_dep) - 1);
2484
2485 memset(needs_dep, false, sizeof(needs_dep));
2486 memset(needs_dep, true, write_len);
2487
2488 clear_deps_for_inst_src(inst, dispatch_width,
2489 needs_dep, first_write_grf, write_len);
2490
2491 /* Walk backwards looking for writes to registers we're writing which
2492 * aren't read since being written. If we hit the start of the program,
2493 * we assume that there are no outstanding dependencies on entry to the
2494 * program.
2495 */
2496 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2497 scan_inst != NULL;
2498 scan_inst = (fs_inst *)scan_inst->prev) {
2499
2500 /* If we hit control flow, assume that there *are* outstanding
2501 * dependencies, and force their cleanup before our instruction.
2502 */
2503 if (scan_inst->is_control_flow()) {
2504 for (int i = 0; i < write_len; i++) {
2505 if (needs_dep[i]) {
2506 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2507 }
2508 }
2509 return;
2510 }
2511
2512 bool scan_inst_16wide = (dispatch_width > 8 &&
2513 !scan_inst->force_uncompressed &&
2514 !scan_inst->force_sechalf);
2515
2516 /* We insert our reads as late as possible on the assumption that any
2517 * instruction but a MOV that might have left us an outstanding
2518 * dependency has more latency than a MOV.
2519 */
2520 if (scan_inst->dst.file == GRF) {
2521 for (int i = 0; i < scan_inst->regs_written; i++) {
2522 int reg = scan_inst->dst.reg + i * reg_size;
2523
2524 if (reg >= first_write_grf &&
2525 reg < first_write_grf + write_len &&
2526 needs_dep[reg - first_write_grf]) {
2527 inst->insert_before(DEP_RESOLVE_MOV(reg));
2528 needs_dep[reg - first_write_grf] = false;
2529 if (scan_inst_16wide)
2530 needs_dep[reg - first_write_grf + 1] = false;
2531 }
2532 }
2533 }
2534
2535 /* Clear the flag for registers that actually got read (as expected). */
2536 clear_deps_for_inst_src(scan_inst, dispatch_width,
2537 needs_dep, first_write_grf, write_len);
2538
2539 /* Continue the loop only if we haven't resolved all the dependencies */
2540 int i;
2541 for (i = 0; i < write_len; i++) {
2542 if (needs_dep[i])
2543 break;
2544 }
2545 if (i == write_len)
2546 return;
2547 }
2548 }
2549
2550 /**
2551 * Implements this workaround for the original 965:
2552 *
2553 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2554 * used as a destination register until after it has been sourced by an
2555  *    instruction with a different destination register."
2556 */
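/* The fix, sketched (register numbers invented for illustration):
 *
 *    send r3 <rest of send instruction>
 *    mov  rX, r3      <- inserted DEP_RESOLVE_MOV sourcing r3
 *    mov  r3, r2      <- only now is r3 safe to redefine
 */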
2557 void
2558 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2559 {
2560 int write_len = inst->regs_written * dispatch_width / 8;
2561 int first_write_grf = inst->dst.reg;
2562 bool needs_dep[BRW_MAX_MRF];
2563 assert(write_len < (int)sizeof(needs_dep) - 1);
2564
2565 memset(needs_dep, false, sizeof(needs_dep));
2566 memset(needs_dep, true, write_len);
2567 /* Walk forwards looking for writes to registers we're writing which aren't
2568 * read before being written.
2569 */
2570 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2571 !scan_inst->is_tail_sentinel();
2572 scan_inst = (fs_inst *)scan_inst->next) {
2573 /* If we hit control flow, force resolve all remaining dependencies. */
2574 if (scan_inst->is_control_flow()) {
2575 for (int i = 0; i < write_len; i++) {
2576 if (needs_dep[i])
2577 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2578 }
2579 return;
2580 }
2581
2582 /* Clear the flag for registers that actually got read (as expected). */
2583 clear_deps_for_inst_src(scan_inst, dispatch_width,
2584 needs_dep, first_write_grf, write_len);
2585
2586 /* We insert our reads as late as possible since they're reading the
2587 * result of a SEND, which has massive latency.
2588 */
2589 if (scan_inst->dst.file == GRF &&
2590 scan_inst->dst.reg >= first_write_grf &&
2591 scan_inst->dst.reg < first_write_grf + write_len &&
2592 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2593 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2594 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2595 }
2596
2597 /* Continue the loop only if we haven't resolved all the dependencies */
2598 int i;
2599 for (i = 0; i < write_len; i++) {
2600 if (needs_dep[i])
2601 break;
2602 }
2603 if (i == write_len)
2604 return;
2605 }
2606
2607 /* If we hit the end of the program, resolve all remaining dependencies out
2608 * of paranoia.
2609 */
2610 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2611 assert(last_inst->eot);
2612 for (int i = 0; i < write_len; i++) {
2613 if (needs_dep[i])
2614 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2615 }
2616 }
2617
2618 void
2619 fs_visitor::insert_gen4_send_dependency_workarounds()
2620 {
2621 if (brw->gen != 4 || brw->is_g4x)
2622 return;
2623
2624 /* Note that we're done with register allocation, so GRF fs_regs always
2625 * have a .reg_offset of 0.
2626 */
2627
2628 foreach_list_safe(node, &this->instructions) {
2629 fs_inst *inst = (fs_inst *)node;
2630
2631 if (inst->mlen != 0 && inst->dst.file == GRF) {
2632 insert_gen4_pre_send_dependency_workarounds(inst);
2633 insert_gen4_post_send_dependency_workarounds(inst);
2634 }
2635 }
2636 }
2637
2638 /**
2639 * Turns the generic expression-style uniform pull constant load instruction
2640 * into a hardware-specific series of instructions for loading a pull
2641 * constant.
2642 *
2643 * The expression style allows the CSE pass before this to optimize out
2644 * repeated loads from the same offset, and gives the pre-register-allocation
2645 * scheduling full flexibility, while the conversion to native instructions
2646 * allows the post-register-allocation scheduler the best information
2647 * possible.
2648 *
2649 * Note that execution masking for setting up pull constant loads is special:
2650 * the channels that need to be written are unrelated to the current execution
2651 * mask, since a later instruction will use one of the result channels as a
2652 * source operand for all 8 or 16 of its channels.
2653 */
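/* A sketch of the gen7 lowering performed below (opcodes from this file,
 * operand layout simplified for illustration):
 *
 *    vgrf4 = UNIFORM_PULL_CONSTANT_LOAD surf_index, byte_offset
 * becomes
 *    payload = SET_SIMD4X2_OFFSET byte_offset / 4
 *    vgrf4   = UNIFORM_PULL_CONSTANT_LOAD_GEN7 surf_index, payload
 */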
2654 void
2655 fs_visitor::lower_uniform_pull_constant_loads()
2656 {
2657 foreach_list(node, &this->instructions) {
2658 fs_inst *inst = (fs_inst *)node;
2659
2660 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2661 continue;
2662
2663 if (brw->gen >= 7) {
2664 /* The offset arg before was a vec4-aligned byte offset. We need to
2665 * turn it into a dword offset.
2666 */
2667 fs_reg const_offset_reg = inst->src[1];
2668 assert(const_offset_reg.file == IMM &&
2669 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2670 const_offset_reg.imm.u /= 4;
2671 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2672
2673 /* This is actually going to be a MOV, but since only the first dword
2674 * is accessed, we have a special opcode to do just that one. Note
2675 * that this needs to be an operation that will be considered a def
2676 * by live variable analysis, or register allocation will explode.
2677 */
2678 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2679 payload, const_offset_reg);
2680 setup->force_writemask_all = true;
2681
2682 setup->ir = inst->ir;
2683 setup->annotation = inst->annotation;
2684 inst->insert_before(setup);
2685
2686 /* Similarly, this will only populate the first 4 channels of the
2687 * result register (since we only use smear values from 0-3), but we
2688 * don't tell the optimizer.
2689 */
2690 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2691 inst->src[1] = payload;
2692
2693 this->live_intervals_valid = false;
2694 } else {
2695 /* Before register allocation, we didn't tell the scheduler about the
2696 * MRF we use. We know it's safe to use this MRF because nothing
2697 * else does except for register spill/unspill, which generates and
2698 * uses its MRF within a single IR instruction.
2699 */
2700 inst->base_mrf = 14;
2701 inst->mlen = 1;
2702 }
2703 }
2704 }
2705
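/**
 * Prints one IR instruction in a human-readable form for debugging:
 * predicate, opcode with modifiers, the destination, then each source.
 */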
2706 void
2707 fs_visitor::dump_instruction(backend_instruction *be_inst)
2708 {
2709 fs_inst *inst = (fs_inst *)be_inst;
2710
2711 if (inst->predicate) {
2712 printf("(%cf0.%d) ",
2713 inst->predicate_inverse ? '-' : '+',
2714 inst->flag_subreg);
2715 }
2716
2717 printf("%s", brw_instruction_name(inst->opcode));
2718 if (inst->saturate)
2719 printf(".sat");
2720 if (inst->conditional_mod) {
2721 printf(".cmod");
2722 if (!inst->predicate &&
2723 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2724 inst->opcode != BRW_OPCODE_IF &&
2725 inst->opcode != BRW_OPCODE_WHILE))) {
2726 printf(".f0.%d", inst->flag_subreg);
2727 }
2728 }
2729 printf(" ");
2730
2731
2732 switch (inst->dst.file) {
2733 case GRF:
2734 printf("vgrf%d", inst->dst.reg);
2735 if (inst->dst.reg_offset)
2736 printf("+%d", inst->dst.reg_offset);
2737 break;
2738 case MRF:
2739 printf("m%d", inst->dst.reg);
2740 break;
2741 case BAD_FILE:
2742 printf("(null)");
2743 break;
2744 case UNIFORM:
2745 printf("***u%d***", inst->dst.reg);
2746 break;
2747 case ARF:
2748 if (inst->dst.reg == BRW_ARF_NULL)
2749 printf("(null)");
2750 else
2751 printf("arf%d", inst->dst.reg);
2752 break;
2753 default:
2754 printf("???");
2755 break;
2756 }
2757 printf(", ");
2758
2759 for (int i = 0; i < 3; i++) {
2760 if (inst->src[i].negate)
2761 printf("-");
2762 if (inst->src[i].abs)
2763 printf("|");
2764 switch (inst->src[i].file) {
2765 case GRF:
2766 printf("vgrf%d", inst->src[i].reg);
2767 if (inst->src[i].reg_offset)
2768 printf("+%d", inst->src[i].reg_offset);
2769 break;
2770 case MRF:
2771 printf("***m%d***", inst->src[i].reg);
2772 break;
2773 case UNIFORM:
2774 printf("u%d", inst->src[i].reg);
2775 if (inst->src[i].reg_offset)
2776 printf(".%d", inst->src[i].reg_offset);
2777 break;
2778 case BAD_FILE:
2779 printf("(null)");
2780 break;
2781 case IMM:
2782 switch (inst->src[i].type) {
2783 case BRW_REGISTER_TYPE_F:
2784 printf("%ff", inst->src[i].imm.f);
2785 break;
2786 case BRW_REGISTER_TYPE_D:
2787 printf("%dd", inst->src[i].imm.i);
2788 break;
2789 case BRW_REGISTER_TYPE_UD:
2790 printf("%uu", inst->src[i].imm.u);
2791 break;
2792 default:
2793 printf("???");
2794 break;
2795 }
2796 break;
2797 default:
2798 printf("???");
2799 break;
2800 }
2801 if (inst->src[i].abs)
2802 printf("|");
2803
2804       if (i < 2)
2805 printf(", ");
2806 }
2807
2808 printf(" ");
2809
2810 if (inst->force_uncompressed)
2811 printf("1sthalf ");
2812
2813 if (inst->force_sechalf)
2814 printf("2ndhalf ");
2815
2816 printf("\n");
2817 }
2818
2819 /**
2820 * Possibly returns an instruction that set up @param reg.
2821 *
2822 * Sometimes we want to take the result of some expression/variable
2823 * dereference tree and rewrite the instruction generating the result
2824 * of the tree. When processing the tree, we know that the
2825 * instructions generated are all writing temporaries that are dead
2826 * outside of this tree. So, if we have some instructions that write
2827 * a temporary, we're free to point that temp write somewhere else.
2828 *
2829 * Note that this doesn't guarantee that the instruction generated
2830 * only reg -- it might be the size=4 destination of a texture instruction.
2831 */
2832 fs_inst *
2833 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2834 fs_inst *end,
2835 fs_reg reg)
2836 {
2837 if (end == start ||
2838 end->is_partial_write() ||
2839 reg.reladdr ||
2840 !reg.equals(end->dst)) {
2841 return NULL;
2842 } else {
2843 return end;
2844 }
2845 }
2846
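/**
 * Works out where each input lives in the gen6+ thread payload
 * (barycentric coordinates, source depth and W), recording the register
 * locations in the brw_wm_compile struct for later stages to consume.
 */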
2847 void
2848 fs_visitor::setup_payload_gen6()
2849 {
2850 bool uses_depth =
2851 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2852 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2853
2854 assert(brw->gen >= 6);
2855
2856 /* R0-1: masks, pixel X/Y coordinates. */
2857 c->nr_payload_regs = 2;
2858    /* R2: only for 32-pixel dispatch. */
2859
2860 /* R3-26: barycentric interpolation coordinates. These appear in the
2861 * same order that they appear in the brw_wm_barycentric_interp_mode
2862 * enum. Each set of coordinates occupies 2 registers if dispatch width
2863 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2864 * appear if they were enabled using the "Barycentric Interpolation
2865 * Mode" bits in WM_STATE.
2866 */
2867 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2868 if (barycentric_interp_modes & (1 << i)) {
2869 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2870 c->nr_payload_regs += 2;
2871 if (dispatch_width == 16) {
2872 c->nr_payload_regs += 2;
2873 }
2874 }
2875 }
2876
2877 /* R27: interpolated depth if uses source depth */
2878 if (uses_depth) {
2879 c->source_depth_reg = c->nr_payload_regs;
2880 c->nr_payload_regs++;
2881 if (dispatch_width == 16) {
2882 /* R28: interpolated depth if not 8-wide. */
2883 c->nr_payload_regs++;
2884 }
2885 }
2886 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2887 if (uses_depth) {
2888 c->source_w_reg = c->nr_payload_regs;
2889 c->nr_payload_regs++;
2890 if (dispatch_width == 16) {
2891 /* R30: interpolated W if not 8-wide. */
2892 c->nr_payload_regs++;
2893 }
2894 }
2895 /* R31: MSAA position offsets. */
2896 /* R32-: bary for 32-pixel. */
2897 /* R58-59: interp W for 32-pixel. */
2898
2899 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2900 c->source_depth_to_render_target = true;
2901 }
2902 }
2903
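/**
 * Drives the whole compile for one dispatch width: payload setup, IR
 * generation, the optimization loop, scheduling, and register allocation.
 * Returns false if compilation failed.
 */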
2904 bool
2905 fs_visitor::run()
2906 {
2907 sanity_param_count = fp->Base.Parameters->NumParameters;
2908 uint32_t orig_nr_params = c->prog_data.nr_params;
2909
2910 if (brw->gen >= 6)
2911 setup_payload_gen6();
2912 else
2913 setup_payload_gen4();
2914
2915 if (0) {
2916 emit_dummy_fs();
2917 } else {
2918 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2919 emit_shader_time_begin();
2920
2921 calculate_urb_setup();
2922 if (brw->gen < 6)
2923 emit_interpolation_setup_gen4();
2924 else
2925 emit_interpolation_setup_gen6();
2926
2927 /* We handle discards by keeping track of the still-live pixels in f0.1.
2928 * Initialize it with the dispatched pixels.
2929 */
2930 if (fp->UsesKill) {
2931 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2932 discard_init->flag_subreg = 1;
2933 }
2934
2935       /* Generate FS IR for main().  (The visitor only descends into
2936        * functions called "main".)
2937 */
2938 if (shader) {
2939 foreach_list(node, &*shader->ir) {
2940 ir_instruction *ir = (ir_instruction *)node;
2941 base_ir = ir;
2942 this->result = reg_undef;
2943 ir->accept(this);
2944 }
2945 } else {
2946 emit_fragment_program_code();
2947 }
2948 base_ir = NULL;
2949 if (failed)
2950 return false;
2951
2952 emit(FS_OPCODE_PLACEHOLDER_HALT);
2953
2954 emit_fb_writes();
2955
2956 split_virtual_grfs();
2957
2958 move_uniform_array_access_to_pull_constants();
2959 setup_pull_constants();
2960
2961 bool progress;
2962 do {
2963 progress = false;
2964
2965 compact_virtual_grfs();
2966
2967 progress = remove_duplicate_mrf_writes() || progress;
2968
2969 progress = opt_algebraic() || progress;
2970 progress = opt_cse() || progress;
2971 progress = opt_copy_propagate() || progress;
2972 progress = dead_code_eliminate() || progress;
2973 progress = dead_code_eliminate_local() || progress;
2974 progress = register_coalesce() || progress;
2975 progress = register_coalesce_2() || progress;
2976 progress = compute_to_mrf() || progress;
2977 } while (progress);
2978
2979 remove_dead_constants();
2980
2981 schedule_instructions(false);
2982
2983 lower_uniform_pull_constant_loads();
2984
2985 assign_curb_setup();
2986 assign_urb_setup();
2987
2988 if (0) {
2989 /* Debug of register spilling: Go spill everything. */
2990 for (int i = 0; i < virtual_grf_count; i++) {
2991 spill_reg(i);
2992 }
2993 }
2994
2995 if (0)
2996 assign_regs_trivial();
2997 else {
2998 while (!assign_regs()) {
2999 if (failed)
3000 break;
3001 }
3002 }
3003 }
3004 assert(force_uncompressed_stack == 0);
3005 assert(force_sechalf_stack == 0);
3006
3007 /* This must come after all optimization and register allocation, since
3008 * it inserts dead code that happens to have side effects, and it does
3009 * so based on the actual physical registers in use.
3010 */
3011 insert_gen4_send_dependency_workarounds();
3012
3013 if (failed)
3014 return false;
3015
3016 schedule_instructions(true);
3017
3018 if (dispatch_width == 8) {
3019 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3020 } else {
3021 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3022
3023 /* Make sure we didn't try to sneak in an extra uniform */
3024 assert(orig_nr_params == c->prog_data.nr_params);
3025 (void) orig_nr_params;
3026 }
3027
3028 /* If any state parameters were appended, then ParameterValues could have
3029 * been realloced, in which case the driver uniform storage set up by
3030 * _mesa_associate_uniform_storage() would point to freed memory. Make
3031 * sure that didn't happen.
3032 */
3033 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3034
3035 return !failed;
3036 }
3037
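/**
 * Entry point for fragment shader compilation: runs an 8-wide compile,
 * optionally a 16-wide one on gen5+, and generates native code from the
 * resulting instruction lists.
 */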
3038 const unsigned *
3039 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3040 struct gl_fragment_program *fp,
3041 struct gl_shader_program *prog,
3042 unsigned *final_assembly_size)
3043 {
3044 bool start_busy = false;
3045 float start_time = 0;
3046
3047 if (unlikely(brw->perf_debug)) {
3048 start_busy = (brw->batch.last_bo &&
3049 drm_intel_bo_busy(brw->batch.last_bo));
3050 start_time = get_time();
3051 }
3052
3053 struct brw_shader *shader = NULL;
3054 if (prog)
3055 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3056
3057 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3058 if (prog) {
3059 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3060 _mesa_print_ir(shader->ir, NULL);
3061 printf("\n\n");
3062 } else {
3063 printf("ARB_fragment_program %d ir for native fragment shader\n",
3064 fp->Base.Id);
3065 _mesa_print_program(&fp->Base);
3066 }
3067 }
3068
3069 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3070 */
3071 fs_visitor v(brw, c, prog, fp, 8);
3072 if (!v.run()) {
3073 if (prog) {
3074 prog->LinkStatus = false;
3075 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3076 }
3077
3078 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3079 v.fail_msg);
3080
3081 return NULL;
3082 }
3083
3084 exec_list *simd16_instructions = NULL;
3085 fs_visitor v2(brw, c, prog, fp, 16);
3086 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3087 if (c->prog_data.nr_pull_params == 0) {
3088 /* Try a 16-wide compile */
3089 v2.import_uniforms(&v);
3090 if (!v2.run()) {
3091 perf_debug("16-wide shader failed to compile, falling back to "
3092 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3093 } else {
3094 simd16_instructions = &v2.instructions;
3095 }
3096 } else {
3097 perf_debug("Skipping 16-wide due to pull parameters.\n");
3098 }
3099 }
3100
3101 c->prog_data.dispatch_width = 8;
3102
3103 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3104 const unsigned *generated = g.generate_assembly(&v.instructions,
3105 simd16_instructions,
3106 final_assembly_size);
3107
3108 if (unlikely(brw->perf_debug) && shader) {
3109 if (shader->compiled_once)
3110 brw_wm_debug_recompile(brw, prog, &c->key);
3111 shader->compiled_once = true;
3112
3113 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3114 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3115 (get_time() - start_time) * 1000);
3116 }
3117 }
3118
3119 return generated;
3120 }
3121
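/**
 * Precompiles the fragment shader at link time with a guessed program key,
 * so that the likely draw-time state combination already has a compiled
 * program in the cache.
 */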
3122 bool
3123 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3124 {
3125 struct brw_context *brw = brw_context(ctx);
3126 struct brw_wm_prog_key key;
3127
3128 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3129 return true;
3130
3131 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3132 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3133 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3134 bool program_uses_dfdy = fp->UsesDFdy;
3135
3136 memset(&key, 0, sizeof(key));
3137
3138 if (brw->gen < 6) {
3139 if (fp->UsesKill)
3140 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3141
3142 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3143 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3144
3145 /* Just assume depth testing. */
3146 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3147 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3148 }
3149
3150 if (brw->gen < 6)
3151 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3152
3153 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3154 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3155 continue;
3156
3157 if (brw->gen < 6) {
3158 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3159 key.input_slots_valid |= BITFIELD64_BIT(i);
3160 }
3161 }
3162
3163 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3164
3165 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3166 for (unsigned i = 0; i < sampler_count; i++) {
3167 if (fp->Base.ShadowSamplers & (1 << i)) {
3168 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3169 key.tex.swizzles[i] =
3170 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3171 } else {
3172 /* Color sampler: assume no swizzling. */
3173 key.tex.swizzles[i] = SWIZZLE_XYZW;
3174 }
3175 }
3176
3177 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3178 key.drawable_height = ctx->DrawBuffer->Height;
3179 }
3180
3181 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3182 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3183 }
3184
3185 key.nr_color_regions = 1;
3186
3187 key.program_string_id = bfp->id;
3188
3189 uint32_t old_prog_offset = brw->wm.prog_offset;
3190 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3191
3192 bool success = do_wm_prog(brw, prog, bfp, &key);
3193
3194 brw->wm.prog_offset = old_prog_offset;
3195 brw->wm.prog_data = old_prog_data;
3196
3197 return success;
3198 }