i965: add SHADER_OPCODE_TG4
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "main/uniforms.h"
#include "glsl/glsl_types.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;

   /* This will be the case for almost all instructions. */
   this->regs_written = 1;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
fs_inst *                                                               \
fs_visitor::op(fs_reg dst, fs_reg src0)                                 \
{                                                                       \
   return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);             \
}

#define ALU2(op)                                                        \
fs_inst *                                                               \
fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                    \
{                                                                       \
   return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);       \
}

#define ALU3(op)                                                        \
fs_inst *                                                               \
fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)       \
{                                                                       \
   return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2); \
}

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
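
/* Illustrative expansion (not part of the original file): ALU2(ADD) above
 * generates the two-source emit helper
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * and likewise for the other opcodes listed.
 */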

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(brw->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
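
/* Usage sketch (illustrative, not part of the original file): a CMP to the
 * null register sets the flag, which a later instruction can predicate on:
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(BRW_OPCODE_SEL, dst, src0, src1);
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 *
 * Here x, dst, src0 and src1 stand for arbitrary fs_regs.
 */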

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
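   /* Worked example (illustrative): for const_offset == 6, the code below
    * computes vec4_offset = varying_offset + 4, loads the four components
    * starting there, and the final MOV selects component 6 & 3 == 2 of the
    * returned vec4.
    */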
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (brw->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   if (brw->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (brw->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

bool
fs_reg::is_valid_3src() const
{
   return file == GRF || file == UNIFORM;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}

fs_reg
fs_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds
    * (2^32 cycles / 1.2e9 Hz ~= 3.6 s), which is plenty of time for our
    * purposes.  It is identical across the EUs, but since it's tracking GPU
    * core speed it will increment at a varying rate as render P-states
    * change.
    *
    * The caller could also check if render P-states have changed (or
    * anything else that might disrupt timing) by setting smear to 2 and
    * checking if that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                fs_reg(), payload, offset, value));
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write()
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           this->force_uncompressed ||
           this->force_sechalf);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
   this->nr_params_remap = v->nr_params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the
    * same order we'd walk the type, so walk the list of storage and find
    * anything with our name, or the prefix of a component that starts with
    * our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
   (void)params_before;
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}
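
/* Example (illustrative): a vec4 state slot swizzled XYZW adds four params,
 * one per component, while a scalar slot swizzled XXXX stops after the first
 * iteration, since the second swizzle repeats the first.
 */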

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
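   /* Flip math (illustrative): with flip set and half-integer pixel centers,
    * the ADD above computes wpos.y = (drawable_height - 1.0 + 0.5) - pixel_y,
    * so a pixel at the top of the window lands on the bottom row of GL's
    * lower-left-origin coordinate system.
    */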
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (brw->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (brw->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (c->prog_data.urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               struct brw_reg interp = interp_reg(location, k);
               emit_linterp(attr, fs_reg(interp), interpolation_mode,
                            ir->centroid);
               if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
               if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (brw->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }
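
   /* What the gen6+ sequence computes (illustrative): result =
    * ~(g0.0:D >> 15) & 1, i.e. the inverse of payload bit 15, so the
    * register holds 1 exactly when the polygon is front-facing.
    */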

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (brw->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (brw->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (brw->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (brw->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
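   /* Mapping sketch (illustrative): with nr_payload_regs == 2, UNIFORM
    * slot 11 becomes brw_vec1_grf(2 + 11 / 8, 11 % 8), i.e. channel 3 of
    * the second CURBE register, g3.
    */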
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      c->prog_data.urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (brw->gen >= 6) {
      if (_mesa_bitcount_64(fp->Base.InputsRead &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               c->prog_data.urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(brw, &prev_stage_vue_map,
                             c->key.input_slots_valid);
         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
             * unused.
             */
            if (varying != BRW_VARYING_SLOT_COUNT &&
                (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               c->prog_data.urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               c->prog_data.urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   c->prog_data.num_varying_inputs = urb_next;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf =
      urb_start + c->prog_data.num_varying_inputs * 2;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
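/* Example (illustrative): a size-3 VGRF keeps its own number for
 * reg_offset 0 and gets two fresh size-1 VGRFs for offsets 1 and 2; a use
 * at reg_offset 2 is then rewritten to the second new register at offset 0.
 */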
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].file == GRF) {
               split_grf[inst->src[i].reg] = false;
            }
         }
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_start[new_index] = virtual_grf_start[i];
            virtual_grf_end[new_index] = virtual_grf_end[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
      this->nr_params_remap = c->prog_data.nr_params;

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            /* Section 5.11 of the OpenGL 4.3 spec says:
             *
             *    "Out-of-bounds reads return undefined values, which include
             *     values from other variables of the active program or zero."
             */
            if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
               constant_nr = 0;
            }

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         /* As above, alias out-of-bounds reads to constant 0. */
         if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
            constant_nr = 0;
         }
         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for newly generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index,
                                                     *inst->src[i].reladdr,
                                                     pull_constant_loc[uniform] +
                                                     inst->src[i].reg_offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
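   /* Worked example (illustrative): with pull_uniform_base == 128, param 130
    * lands in pull_param slot 2; the load emitted below then reads from the
    * 16-byte-aligned offset (2 * 4) & ~15 == 0 and smears component
    * 2 & 3 == 2 across the destination.
    */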
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Removes any instructions writing a VGRF where that VGRF is not used by any
 * later instruction.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF) {
         assert(this->virtual_grf_end[inst->dst.reg] >= pc);
         if (this->virtual_grf_end[inst->dst.reg] == pc) {
            inst->remove();
            progress = true;
         }
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

struct dead_code_hash_key
{
   int vgrf;
   int reg_offset;
};

static bool
dead_code_hash_compare(const void *a, const void *b)
{
   return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
}

static void
clear_dead_code_hash(struct hash_table *ht)
{
   struct hash_entry *entry;

   hash_table_foreach(ht, entry) {
      _mesa_hash_table_remove(ht, entry);
   }
}

static void
insert_dead_code_hash(struct hash_table *ht,
                      int vgrf, int reg_offset, fs_inst *inst)
{
   /* We don't bother freeing keys, because they'll be GCed with the ht. */
   struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);

   key->vgrf = vgrf;
   key->reg_offset = reg_offset;

   _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
}

static struct hash_entry *
get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
{
   struct dead_code_hash_key key;

   key.vgrf = vgrf;
   key.reg_offset = reg_offset;

   return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
}

static void
remove_dead_code_hash(struct hash_table *ht,
                      int vgrf, int reg_offset)
{
   struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
   if (!entry)
      return;

   _mesa_hash_table_remove(ht, entry);
}

/**
 * Walks basic blocks, removing any regs that are written but not read before
 * being redefined.
 *
 * The dead_code_eliminate() function implements a global dead code
 * elimination, but it only handles removing the last write to a register
 * if it's never read.  This one can handle intermediate writes, but only
 * within a basic block.
 */
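/* Approach sketch (illustrative): the hash table maps (vgrf, reg_offset) to
 * the most recent unread write.  Within one block, "MOV a, x; MOV a, y"
 * kills the first MOV; a read of a, or any control-flow instruction,
 * clears the corresponding entries instead.
 */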
1927 bool
1928 fs_visitor::dead_code_eliminate_local()
1929 {
1930 struct hash_table *ht;
1931 bool progress = false;
1932
1933 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1934
1935 foreach_list_safe(node, &this->instructions) {
1936 fs_inst *inst = (fs_inst *)node;
1937
1938 /* At a basic block boundary, empty the HT, since we don't track
1939 * dataflow across blocks.
1940 */
1941 if (inst->is_control_flow()) {
1942 clear_dead_code_hash(ht);
1943 continue;
1944 }
1945
1946 /* Clear the HT of any instructions that got read. */
1947 for (int i = 0; i < 3; i++) {
1948 fs_reg src = inst->src[i];
1949 if (src.file != GRF)
1950 continue;
1951
1952 int read = 1;
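/* Send-from-GRF messages are treated as reading their payload through the
 * end of the VGRF, not just the named register. */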
1953 if (inst->is_send_from_grf())
1954 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1955
1956 for (int reg_offset = src.reg_offset;
1957 reg_offset < src.reg_offset + read;
1958 reg_offset++) {
1959 remove_dead_code_hash(ht, src.reg, reg_offset);
1960 }
1961 }
1962
1963 /* Add any update of a GRF to the HT, removing a previous write if it
1964 * wasn't read.
1965 */
1966 if (inst->dst.file == GRF) {
1967 if (inst->regs_written > 1) {
1968 /* We don't know how to trim channels from an instruction's
1969 * writes, so we can't incrementally remove unread channels from
1970 * it.  Just remove whatever it overwrites from the table.
1971 */
1972 for (int i = 0; i < inst->regs_written; i++) {
1973 remove_dead_code_hash(ht,
1974 inst->dst.reg,
1975 inst->dst.reg_offset + i);
1976 }
1977 } else {
1978 struct hash_entry *entry =
1979 get_dead_code_hash_entry(ht, inst->dst.reg,
1980 inst->dst.reg_offset);
1981
1982 if (inst->is_partial_write()) {
1983 /* For a partial write, we can't remove any previous dead code
1984 * candidate, since we're just modifying its result, but we can
1985 * be dead code eliminated ourselves.
1986 */
1987 if (entry) {
1988 entry->data = inst;
1989 } else {
1990 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1991 inst);
1992 }
1993 } else {
1994 if (entry) {
1995 /* We're completely updating a channel, and there was a
1996 * previous write to the channel that wasn't read. Kill it!
1997 */
1998 fs_inst *dead_inst = (fs_inst *)entry->data;
1999 dead_inst->remove();
2000 progress = true;
2001 _mesa_hash_table_remove(ht, entry);
2002 }
2003
2004 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2005 inst);
2006 }
2007 }
2008 }
2009 }
2010
2011 _mesa_hash_table_destroy(ht, NULL);
2012
2013 if (progress)
2014 live_intervals_valid = false;
2015
2016 return progress;
2017 }
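/* A hypothetical block this pass cleans up and the global pass would not:
 *
 *    mov vgrf5, vgrf1    <- dead: fully overwritten below before any read
 *    mov vgrf5, vgrf2
 *    add vgrf6, vgrf5, vgrf3
 */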
2018
2019 /**
2020 * Implements a second type of register coalescing: This one checks if
2021 * the two regs involved in a raw move don't interfere, in which case
2022 * they can both be stored in the same place and the MOV removed.
2023 */
2024 bool
2025 fs_visitor::register_coalesce_2()
2026 {
2027 bool progress = false;
2028
2029 calculate_live_intervals();
2030
2031 foreach_list_safe(node, &this->instructions) {
2032 fs_inst *inst = (fs_inst *)node;
2033
2034 if (inst->opcode != BRW_OPCODE_MOV ||
2035 inst->is_partial_write() ||
2036 inst->saturate ||
2037 inst->src[0].file != GRF ||
2038 inst->src[0].negate ||
2039 inst->src[0].abs ||
2040 inst->src[0].smear != -1 ||
2041 inst->dst.file != GRF ||
2042 inst->dst.type != inst->src[0].type ||
2043 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2044 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2045 continue;
2046 }
2047
2048 int reg_from = inst->src[0].reg;
2049 assert(inst->src[0].reg_offset == 0);
2050 int reg_to = inst->dst.reg;
2051 int reg_to_offset = inst->dst.reg_offset;
2052
2053 foreach_list(node, &this->instructions) {
2054 fs_inst *scan_inst = (fs_inst *)node;
2055
2056 if (scan_inst->dst.file == GRF &&
2057 scan_inst->dst.reg == reg_from) {
2058 scan_inst->dst.reg = reg_to;
2059 scan_inst->dst.reg_offset = reg_to_offset;
2060 }
2061 for (int i = 0; i < 3; i++) {
2062 if (scan_inst->src[i].file == GRF &&
2063 scan_inst->src[i].reg == reg_from) {
2064 scan_inst->src[i].reg = reg_to;
2065 scan_inst->src[i].reg_offset = reg_to_offset;
2066 }
2067 }
2068 }
2069
2070 inst->remove();
2071
2072 /* We don't need to recalculate live intervals inside the loop despite
2073 * flagging live_intervals_valid because we only use live intervals for
2074 * the interferes test, and we must have had a situation where the
2075 * intervals were:
2076 *
2077 *    from        to
2078 *     ^
2079 *     |
2080 *     v
2081 *                  ^
2082 *                  |
2083 *                  v
2084 *
2085 * Some register R that might get coalesced with one of these two could
2086 * only be referencing "to", otherwise "from"'s range would have been
2087 * longer. R's range could also only start at the end of "to" or later,
2088 * otherwise it will conflict with "to" when we try to coalesce "to"
2089 * into R anyway.
2090 */
2091 live_intervals_valid = false;
2092
2093 progress = true;
2094 continue;
2095 }
2096
2097 return progress;
2098 }
2099
2100 bool
2101 fs_visitor::register_coalesce()
2102 {
2103 bool progress = false;
2104 int if_depth = 0;
2105 int loop_depth = 0;
2106
2107 foreach_list_safe(node, &this->instructions) {
2108 fs_inst *inst = (fs_inst *)node;
2109
2110 /* Make sure that we dominate the instructions we're going to
2111 * scan for interfering with our coalescing, or we won't have
2112 * scanned enough to see if anything interferes with our
2113 * coalescing. We don't dominate the following instructions if
2114 * we're in a loop or an if block.
2115 */
2116 switch (inst->opcode) {
2117 case BRW_OPCODE_DO:
2118 loop_depth++;
2119 break;
2120 case BRW_OPCODE_WHILE:
2121 loop_depth--;
2122 break;
2123 case BRW_OPCODE_IF:
2124 if_depth++;
2125 break;
2126 case BRW_OPCODE_ENDIF:
2127 if_depth--;
2128 break;
2129 default:
2130 break;
2131 }
2132 if (loop_depth || if_depth)
2133 continue;
2134
2135 if (inst->opcode != BRW_OPCODE_MOV ||
2136 inst->is_partial_write() ||
2137 inst->saturate ||
2138 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2139 inst->src[0].file != UNIFORM) ||
2140 inst->dst.type != inst->src[0].type)
2141 continue;
2142
2143 bool has_source_modifiers = (inst->src[0].abs ||
2144 inst->src[0].negate ||
2145 inst->src[0].smear != -1 ||
2146 inst->src[0].file == UNIFORM);
2147
2148 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2149 * them: check for no writes to either one until the exit of the
2150 * program.
2151 */
2152 bool interfered = false;
2153
2154 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2155 !scan_inst->is_tail_sentinel();
2156 scan_inst = (fs_inst *)scan_inst->next) {
2157 if (scan_inst->dst.file == GRF) {
2158 if (scan_inst->overwrites_reg(inst->dst) ||
2159 scan_inst->overwrites_reg(inst->src[0])) {
2160 interfered = true;
2161 break;
2162 }
2163 }
2164
2165 if (has_source_modifiers) {
2166 for (int i = 0; i < 3; i++) {
2167 if (scan_inst->src[i].file == GRF &&
2168 scan_inst->src[i].reg == inst->dst.reg &&
2169 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2170 inst->dst.type != scan_inst->src[i].type)
2171 {
2172 interfered = true;
2173 break;
2174 }
2175 }
2176 }
2177
2179 /* The gen6 MATH instruction can't handle source modifiers or
2180 * unusual register regions, so avoid coalescing those for
2181 * now. We should do something more specific.
2182 */
2183 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2184 interfered = true;
2185 break;
2186 }
2187
2188 /* The accumulator result appears to get used for the
2189 * conditional modifier generation. When negating a UD
2190 * value, there is a 33rd bit generated for the sign in the
2191 * accumulator value, so you can no longer check, for example,
2192 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2193 */
2194 if (scan_inst->conditional_mod &&
2195 inst->src[0].negate &&
2196 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2197 interfered = true;
2198 break;
2199 }
2200 }
2201 if (interfered) {
2202 continue;
2203 }
2204
2205 /* Rewrite the later usage to point at the source of the move to
2206 * be removed.
2207 */
2208 for (fs_inst *scan_inst = inst;
2209 !scan_inst->is_tail_sentinel();
2210 scan_inst = (fs_inst *)scan_inst->next) {
2211 for (int i = 0; i < 3; i++) {
2212 if (scan_inst->src[i].file == GRF &&
2213 scan_inst->src[i].reg == inst->dst.reg &&
2214 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
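/* Compose the modifiers: an abs at the use discards any negate
 * carried by the MOV's source, and the use's own negate is then
 * XORed in. */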
2215 fs_reg new_src = inst->src[0];
2216 if (scan_inst->src[i].abs) {
2217 new_src.negate = 0;
2218 new_src.abs = 1;
2219 }
2220 new_src.negate ^= scan_inst->src[i].negate;
2221 scan_inst->src[i] = new_src;
2222 }
2223 }
2224 }
2225
2226 inst->remove();
2227 progress = true;
2228 }
2229
2230 if (progress)
2231 live_intervals_valid = false;
2232
2233 return progress;
2234 }
2235
2236
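/**
 * Tries to rewrite the instruction computing a GRF value so that it writes
 * directly into the MRF that a later raw MOV copies it to, removing the MOV.
 * With made-up registers:
 *
 *    add vgrf7, vgrf1, vgrf2
 *    mov m4, vgrf7
 *
 * becomes
 *
 *    add m4, vgrf1, vgrf2
 */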
2237 bool
2238 fs_visitor::compute_to_mrf()
2239 {
2240 bool progress = false;
2241 int next_ip = 0;
2242
2243 calculate_live_intervals();
2244
2245 foreach_list_safe(node, &this->instructions) {
2246 fs_inst *inst = (fs_inst *)node;
2247
2248 int ip = next_ip;
2249 next_ip++;
2250
2251 if (inst->opcode != BRW_OPCODE_MOV ||
2252 inst->is_partial_write() ||
2253 inst->dst.file != MRF || inst->src[0].file != GRF ||
2254 inst->dst.type != inst->src[0].type ||
2255 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2256 continue;
2257
2258 /* Work out which hardware MRF registers are written by this
2259 * instruction.
2260 */
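/* COMPR4 addressing writes m(n) and m(n+4); an ordinary compressed
 * (16-wide) write covers two adjacent MRFs; otherwise a single MRF
 * is written. */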
2261 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2262 int mrf_high;
2263 if (inst->dst.reg & BRW_MRF_COMPR4) {
2264 mrf_high = mrf_low + 4;
2265 } else if (dispatch_width == 16 &&
2266 (!inst->force_uncompressed && !inst->force_sechalf)) {
2267 mrf_high = mrf_low + 1;
2268 } else {
2269 mrf_high = mrf_low;
2270 }
2271
2272 /* Can't compute-to-MRF this GRF if someone else was going to
2273 * read it later.
2274 */
2275 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2276 continue;
2277
2278 /* Found a move of a GRF to a MRF. Let's see if we can go
2279 * rewrite the thing that made this GRF to write into the MRF.
2280 */
2281 fs_inst *scan_inst;
2282 for (scan_inst = (fs_inst *)inst->prev;
2283 scan_inst->prev != NULL;
2284 scan_inst = (fs_inst *)scan_inst->prev) {
2285 if (scan_inst->dst.file == GRF &&
2286 scan_inst->dst.reg == inst->src[0].reg) {
2287 /* Found the last thing to write our reg we want to turn
2288 * into a compute-to-MRF.
2289 */
2290
2291 /* If this one instruction didn't populate all the
2292 * channels, bail. We might be able to rewrite everything
2293 * that writes that reg, but it would require smarter
2294 * tracking to delay the rewriting until complete success.
2295 */
2296 if (scan_inst->is_partial_write())
2297 break;
2298
2299 /* Things returning more than one register would need us to
2300 * understand coalescing out more than one MOV at a time.
2301 */
2302 if (scan_inst->regs_written > 1)
2303 break;
2304
2305 /* SEND instructions can't have MRF as a destination. */
2306 if (scan_inst->mlen)
2307 break;
2308
2309 if (brw->gen == 6) {
2310 /* gen6 math instructions must have the destination be
2311 * GRF, so no compute-to-MRF for them.
2312 */
2313 if (scan_inst->is_math()) {
2314 break;
2315 }
2316 }
2317
2318 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2319 /* Found the creator of our MRF's source value. */
2320 scan_inst->dst.file = MRF;
2321 scan_inst->dst.reg = inst->dst.reg;
2322 scan_inst->saturate |= inst->saturate;
2323 inst->remove();
2324 progress = true;
2325 }
2326 break;
2327 }
2328
2329 /* We don't handle control flow here. Most computation of
2330 * values that end up in MRFs happens shortly before the MRF
2331 * write anyway.
2332 */
2333 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2334 break;
2335
2336 /* You can't read from an MRF, so if someone else reads our
2337 * MRF's source GRF that we wanted to rewrite, that stops us.
2338 */
2339 bool interfered = false;
2340 for (int i = 0; i < 3; i++) {
2341 if (scan_inst->src[i].file == GRF &&
2342 scan_inst->src[i].reg == inst->src[0].reg &&
2343 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2344 interfered = true;
2345 }
2346 }
2347 if (interfered)
2348 break;
2349
2350 if (scan_inst->dst.file == MRF) {
2351 /* If somebody else writes our MRF here, we can't
2352 * compute-to-MRF before that.
2353 */
2354 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2355 int scan_mrf_high;
2356
2357 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2358 scan_mrf_high = scan_mrf_low + 4;
2359 } else if (dispatch_width == 16 &&
2360 (!scan_inst->force_uncompressed &&
2361 !scan_inst->force_sechalf)) {
2362 scan_mrf_high = scan_mrf_low + 1;
2363 } else {
2364 scan_mrf_high = scan_mrf_low;
2365 }
2366
2367 if (mrf_low == scan_mrf_low ||
2368 mrf_low == scan_mrf_high ||
2369 mrf_high == scan_mrf_low ||
2370 mrf_high == scan_mrf_high) {
2371 break;
2372 }
2373 }
2374
2375 if (scan_inst->mlen > 0) {
2376 /* Found a SEND instruction, which means that there are
2377 * live values in MRFs from base_mrf to base_mrf +
2378 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2379 * above it.
2380 */
2381 if (mrf_low >= scan_inst->base_mrf &&
2382 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2383 break;
2384 }
2385 if (mrf_high >= scan_inst->base_mrf &&
2386 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2387 break;
2388 }
2389 }
2390 }
2391 }
2392
2393 if (progress)
2394 live_intervals_valid = false;
2395
2396 return progress;
2397 }
2398
2399 /**
2400 * Walks through basic blocks, looking for repeated MRF writes and
2401 * removing the later ones.
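 *
 * For example, a second identical "mov m3, vgrf5" with no intervening write
 * to m3 or vgrf5 (and no SEND touching m3) is redundant and gets removed.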
2402 */
2403 bool
2404 fs_visitor::remove_duplicate_mrf_writes()
2405 {
2406 fs_inst *last_mrf_move[16];
2407 bool progress = false;
2408
2409 /* This pass would need its MRF tracking updated to handle compressed (16-wide) instructions, so skip it for now. */
2410 if (dispatch_width == 16)
2411 return false;
2412
2413 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2414
2415 foreach_list_safe(node, &this->instructions) {
2416 fs_inst *inst = (fs_inst *)node;
2417
2418 if (inst->is_control_flow()) {
2419 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2420 }
2421
2422 if (inst->opcode == BRW_OPCODE_MOV &&
2423 inst->dst.file == MRF) {
2424 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2425 if (prev_inst && inst->equals(prev_inst)) {
2426 inst->remove();
2427 progress = true;
2428 continue;
2429 }
2430 }
2431
2432 /* Clear out the last-write records for MRFs that were overwritten. */
2433 if (inst->dst.file == MRF) {
2434 last_mrf_move[inst->dst.reg] = NULL;
2435 }
2436
2437 if (inst->mlen > 0) {
2438 /* Found a SEND instruction, which will include two or fewer
2439 * implied MRF writes. We could do better here.
2440 */
2441 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2442 last_mrf_move[inst->base_mrf + i] = NULL;
2443 }
2444 }
2445
2446 /* Clear out any MRF move records whose sources got overwritten. */
2447 if (inst->dst.file == GRF) {
2448 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2449 if (last_mrf_move[i] &&
2450 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2451 last_mrf_move[i] = NULL;
2452 }
2453 }
2454 }
2455
2456 if (inst->opcode == BRW_OPCODE_MOV &&
2457 inst->dst.file == MRF &&
2458 inst->src[0].file == GRF &&
2459 !inst->is_partial_write()) {
2460 last_mrf_move[inst->dst.reg] = inst;
2461 }
2462 }
2463
2464 if (progress)
2465 live_intervals_valid = false;
2466
2467 return progress;
2468 }
2469
2470 static void
2471 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2472 int first_grf, int grf_len)
2473 {
2474 bool inst_16wide = (dispatch_width > 8 &&
2475 !inst->force_uncompressed &&
2476 !inst->force_sechalf);
2477
2478 /* Clear the flag for registers that actually got read (as expected). */
2479 for (int i = 0; i < 3; i++) {
2480 int grf;
2481 if (inst->src[i].file == GRF) {
2482 grf = inst->src[i].reg;
2483 } else if (inst->src[i].file == HW_REG &&
2484 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2485 grf = inst->src[i].fixed_hw_reg.nr;
2486 } else {
2487 continue;
2488 }
2489
2490 if (grf >= first_grf &&
2491 grf < first_grf + grf_len) {
2492 deps[grf - first_grf] = false;
2493 if (inst_16wide)
2494 deps[grf - first_grf + 1] = false;
2495 }
2496 }
2497 }
2498
2499 /**
2500 * Implements this workaround for the original 965:
2501 *
2502 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2503 * check for post destination dependencies on this instruction, software
2504 * must ensure that there is no destination hazard for the case of ‘write
2505 * followed by a posted write’ shown in the following example.
2506 *
2507 * 1. mov r3 0
2508 * 2. send r3.xy <rest of send instruction>
2509 * 3. mov r2 r3
2510 *
2511 * Due to no post-destination dependency check on the ‘send’, the above
2512 * code sequence could have two instructions (1 and 2) in flight at the
2513 * same time that both consider ‘r3’ as the target of their final writes."
2514 */
2515 void
2516 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2517 {
2518 int reg_size = dispatch_width / 8;
2519 int write_len = inst->regs_written * reg_size;
2520 int first_write_grf = inst->dst.reg;
2521 bool needs_dep[BRW_MAX_MRF];
2522 assert(write_len < (int)sizeof(needs_dep) - 1);
2523
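/* Start by assuming every GRF this instruction writes has a potential
 * outstanding dependency; the backwards walk below clears entries as the
 * corresponding reads and writes are found. */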
2524 memset(needs_dep, false, sizeof(needs_dep));
2525 memset(needs_dep, true, write_len);
2526
2527 clear_deps_for_inst_src(inst, dispatch_width,
2528 needs_dep, first_write_grf, write_len);
2529
2530 /* Walk backwards looking for writes to registers we're writing which
2531 * aren't read since being written. If we hit the start of the program,
2532 * we assume that there are no outstanding dependencies on entry to the
2533 * program.
2534 */
2535 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2536 scan_inst != NULL;
2537 scan_inst = (fs_inst *)scan_inst->prev) {
2538
2539 /* If we hit control flow, assume that there *are* outstanding
2540 * dependencies, and force their cleanup before our instruction.
2541 */
2542 if (scan_inst->is_control_flow()) {
2543 for (int i = 0; i < write_len; i++) {
2544 if (needs_dep[i]) {
2545 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2546 }
2547 }
2548 return;
2549 }
2550
2551 bool scan_inst_16wide = (dispatch_width > 8 &&
2552 !scan_inst->force_uncompressed &&
2553 !scan_inst->force_sechalf);
2554
2555 /* We insert our reads as late as possible on the assumption that any
2556 * instruction but a MOV that might have left us an outstanding
2557 * dependency has more latency than a MOV.
2558 */
2559 if (scan_inst->dst.file == GRF) {
2560 for (int i = 0; i < scan_inst->regs_written; i++) {
2561 int reg = scan_inst->dst.reg + i * reg_size;
2562
2563 if (reg >= first_write_grf &&
2564 reg < first_write_grf + write_len &&
2565 needs_dep[reg - first_write_grf]) {
2566 inst->insert_before(DEP_RESOLVE_MOV(reg));
2567 needs_dep[reg - first_write_grf] = false;
2568 if (scan_inst_16wide)
2569 needs_dep[reg - first_write_grf + 1] = false;
2570 }
2571 }
2572 }
2573
2574 /* Clear the flag for registers that actually got read (as expected). */
2575 clear_deps_for_inst_src(scan_inst, dispatch_width,
2576 needs_dep, first_write_grf, write_len);
2577
2578 /* Continue the loop only if we haven't resolved all the dependencies */
2579 int i;
2580 for (i = 0; i < write_len; i++) {
2581 if (needs_dep[i])
2582 break;
2583 }
2584 if (i == write_len)
2585 return;
2586 }
2587 }
2588
2589 /**
2590 * Implements this workaround for the original 965:
2591 *
2592 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2593 * used as a destination register until after it has been sourced by an
2594 * instruction with a different destination register."
2595 */
2596 void
2597 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2598 {
2599 int write_len = inst->regs_written * dispatch_width / 8;
2600 int first_write_grf = inst->dst.reg;
2601 bool needs_dep[BRW_MAX_MRF];
2602 assert(write_len < (int)sizeof(needs_dep) - 1);
2603
2604 memset(needs_dep, false, sizeof(needs_dep));
2605 memset(needs_dep, true, write_len);
2606 /* Walk forwards looking for writes to registers we're writing which aren't
2607 * read before being written.
2608 */
2609 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2610 !scan_inst->is_tail_sentinel();
2611 scan_inst = (fs_inst *)scan_inst->next) {
2612 /* If we hit control flow, force resolve all remaining dependencies. */
2613 if (scan_inst->is_control_flow()) {
2614 for (int i = 0; i < write_len; i++) {
2615 if (needs_dep[i])
2616 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2617 }
2618 return;
2619 }
2620
2621 /* Clear the flag for registers that actually got read (as expected). */
2622 clear_deps_for_inst_src(scan_inst, dispatch_width,
2623 needs_dep, first_write_grf, write_len);
2624
2625 /* We insert our reads as late as possible since they're reading the
2626 * result of a SEND, which has massive latency.
2627 */
2628 if (scan_inst->dst.file == GRF &&
2629 scan_inst->dst.reg >= first_write_grf &&
2630 scan_inst->dst.reg < first_write_grf + write_len &&
2631 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2632 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2633 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2634 }
2635
2636 /* Continue the loop only if we haven't resolved all the dependencies */
2637 int i;
2638 for (i = 0; i < write_len; i++) {
2639 if (needs_dep[i])
2640 break;
2641 }
2642 if (i == write_len)
2643 return;
2644 }
2645
2646 /* If we hit the end of the program, resolve all remaining dependencies out
2647 * of paranoia.
2648 */
2649 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2650 assert(last_inst->eot);
2651 for (int i = 0; i < write_len; i++) {
2652 if (needs_dep[i])
2653 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2654 }
2655 }
2656
2657 void
2658 fs_visitor::insert_gen4_send_dependency_workarounds()
2659 {
2660 if (brw->gen != 4 || brw->is_g4x)
2661 return;
2662
2663 /* Note that we're done with register allocation, so GRF fs_regs always
2664 * have a .reg_offset of 0.
2665 */
2666
2667 foreach_list_safe(node, &this->instructions) {
2668 fs_inst *inst = (fs_inst *)node;
2669
2670 if (inst->mlen != 0 && inst->dst.file == GRF) {
2671 insert_gen4_pre_send_dependency_workarounds(inst);
2672 insert_gen4_post_send_dependency_workarounds(inst);
2673 }
2674 }
2675 }
2676
2677 /**
2678 * Turns the generic expression-style uniform pull constant load instruction
2679 * into a hardware-specific series of instructions for loading a pull
2680 * constant.
2681 *
2682 * The expression style allows the CSE pass before this to optimize out
2683 * repeated loads from the same offset, and gives the pre-register-allocation
2684 * scheduling full flexibility, while the conversion to native instructions
2685 * allows the post-register-allocation scheduler the best information
2686 * possible.
2687 *
2688 * Note that execution masking for setting up pull constant loads is special:
2689 * the channels that need to be written are unrelated to the current execution
2690 * mask, since a later instruction will use one of the result channels as a
2691 * source operand for all 8 or 16 of its channels.
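 *
 * Concretely, on gen7 the logical load becomes a SET_SIMD4X2_OFFSET writing
 * a dword offset into a payload register plus the GEN7 variant of the load
 * sourcing that payload; on older gens the instruction just gets an MRF
 * (base_mrf 14) and a message length assigned.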
2692 */
2693 void
2694 fs_visitor::lower_uniform_pull_constant_loads()
2695 {
2696 foreach_list(node, &this->instructions) {
2697 fs_inst *inst = (fs_inst *)node;
2698
2699 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2700 continue;
2701
2702 if (brw->gen >= 7) {
2703 /* The offset arg before was a vec4-aligned byte offset. We need to
2704 * turn it into a dword offset.
2705 */
2706 fs_reg const_offset_reg = inst->src[1];
2707 assert(const_offset_reg.file == IMM &&
2708 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2709 const_offset_reg.imm.u /= 4;
2710 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2711
2712 /* This is actually going to be a MOV, but since only the first dword
2713 * is accessed, we have a special opcode to do just that one. Note
2714 * that this needs to be an operation that will be considered a def
2715 * by live variable analysis, or register allocation will explode.
2716 */
2717 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2718 payload, const_offset_reg);
2719 setup->force_writemask_all = true;
2720
2721 setup->ir = inst->ir;
2722 setup->annotation = inst->annotation;
2723 inst->insert_before(setup);
2724
2725 /* Similarly, this will only populate the first 4 channels of the
2726 * result register (since we only use smear values from 0-3), but we
2727 * don't tell the optimizer.
2728 */
2729 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2730 inst->src[1] = payload;
2731
2732 this->live_intervals_valid = false;
2733 } else {
2734 /* Before register allocation, we didn't tell the scheduler about the
2735 * MRF we use. We know it's safe to use this MRF because nothing
2736 * else does except for register spill/unspill, which generates and
2737 * uses its MRF within a single IR instruction.
2738 */
2739 inst->base_mrf = 14;
2740 inst->mlen = 1;
2741 }
2742 }
2743 }
2744
2745 void
2746 fs_visitor::dump_instruction(backend_instruction *be_inst)
2747 {
2748 fs_inst *inst = (fs_inst *)be_inst;
2749
2750 if (inst->predicate) {
2751 printf("(%cf0.%d) ",
2752 inst->predicate_inverse ? '-' : '+',
2753 inst->flag_subreg);
2754 }
2755
2756 printf("%s", brw_instruction_name(inst->opcode));
2757 if (inst->saturate)
2758 printf(".sat");
2759 if (inst->conditional_mod) {
2760 printf(".cmod");
2761 if (!inst->predicate &&
2762 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2763 inst->opcode != BRW_OPCODE_IF &&
2764 inst->opcode != BRW_OPCODE_WHILE))) {
2765 printf(".f0.%d", inst->flag_subreg);
2766 }
2767 }
2768 printf(" ");
2769
2771 switch (inst->dst.file) {
2772 case GRF:
2773 printf("vgrf%d", inst->dst.reg);
2774 if (inst->dst.reg_offset)
2775 printf("+%d", inst->dst.reg_offset);
2776 break;
2777 case MRF:
2778 printf("m%d", inst->dst.reg);
2779 break;
2780 case BAD_FILE:
2781 printf("(null)");
2782 break;
2783 case UNIFORM:
2784 printf("***u%d***", inst->dst.reg);
2785 break;
2786 case ARF:
2787 if (inst->dst.reg == BRW_ARF_NULL)
2788 printf("(null)");
2789 else
2790 printf("arf%d", inst->dst.reg);
2791 break;
2792 default:
2793 printf("???");
2794 break;
2795 }
2796 printf(", ");
2797
2798 for (int i = 0; i < 3; i++) {
2799 if (inst->src[i].negate)
2800 printf("-");
2801 if (inst->src[i].abs)
2802 printf("|");
2803 switch (inst->src[i].file) {
2804 case GRF:
2805 printf("vgrf%d", inst->src[i].reg);
2806 if (inst->src[i].reg_offset)
2807 printf("+%d", inst->src[i].reg_offset);
2808 break;
2809 case MRF:
2810 printf("***m%d***", inst->src[i].reg);
2811 break;
2812 case UNIFORM:
2813 printf("u%d", inst->src[i].reg);
2814 if (inst->src[i].reg_offset)
2815 printf(".%d", inst->src[i].reg_offset);
2816 break;
2817 case BAD_FILE:
2818 printf("(null)");
2819 break;
2820 case IMM:
2821 switch (inst->src[i].type) {
2822 case BRW_REGISTER_TYPE_F:
2823 printf("%ff", inst->src[i].imm.f);
2824 break;
2825 case BRW_REGISTER_TYPE_D:
2826 printf("%dd", inst->src[i].imm.i);
2827 break;
2828 case BRW_REGISTER_TYPE_UD:
2829 printf("%uu", inst->src[i].imm.u);
2830 break;
2831 default:
2832 printf("???");
2833 break;
2834 }
2835 break;
2836 default:
2837 printf("???");
2838 break;
2839 }
2840 if (inst->src[i].abs)
2841 printf("|");
2842
2843 if (i < 2) /* don't print a separator after the last source */
2844 printf(", ");
2845 }
2846
2847 printf(" ");
2848
2849 if (inst->force_uncompressed)
2850 printf("1sthalf ");
2851
2852 if (inst->force_sechalf)
2853 printf("2ndhalf ");
2854
2855 printf("\n");
2856 }
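/* A hypothetical line of output from the above, for an instruction that is
 * predicated, saturating, and runs on the second half of a 16-wide dispatch:
 *
 *    (+f0.0) add.sat vgrf7+1, vgrf3, -vgrf4, (null) 2ndhalf
 */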
2857
2858 /**
2859 * Possibly returns an instruction that set up @param reg.
2860 *
2861 * Sometimes we want to take the result of some expression/variable
2862 * dereference tree and rewrite the instruction generating the result
2863 * of the tree. When processing the tree, we know that the
2864 * instructions generated are all writing temporaries that are dead
2865 * outside of this tree. So, if we have some instructions that write
2866 * a temporary, we're free to point that temp write somewhere else.
2867 *
2868 * Note that this doesn't guarantee that the returned instruction wrote only
2869 * reg -- it might be the size=4 destination of a texture instruction.
2870 */
2871 fs_inst *
2872 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2873 fs_inst *end,
2874 fs_reg reg)
2875 {
2876 if (end == start ||
2877 end->is_partial_write() ||
2878 reg.reladdr ||
2879 !reg.equals(end->dst)) {
2880 return NULL;
2881 } else {
2882 return end;
2883 }
2884 }
2885
2886 void
2887 fs_visitor::setup_payload_gen6()
2888 {
2889 bool uses_depth =
2890 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2891 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2892
2893 assert(brw->gen >= 6);
2894
2895 /* R0-1: masks, pixel X/Y coordinates. */
2896 c->nr_payload_regs = 2;
2897 /* R2: only for 32-pixel dispatch. */
2898
2899 /* R3-26: barycentric interpolation coordinates. These appear in the
2900 * same order that they appear in the brw_wm_barycentric_interp_mode
2901 * enum. Each set of coordinates occupies 2 registers if dispatch width
2902 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2903 * appear if they were enabled using the "Barycentric Interpolation
2904 * Mode" bits in WM_STATE.
2905 */
2906 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2907 if (barycentric_interp_modes & (1 << i)) {
2908 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2909 c->nr_payload_regs += 2;
2910 if (dispatch_width == 16) {
2911 c->nr_payload_regs += 2;
2912 }
2913 }
2914 }
2915
2916 /* R27: interpolated depth if uses source depth */
2917 if (uses_depth) {
2918 c->source_depth_reg = c->nr_payload_regs;
2919 c->nr_payload_regs++;
2920 if (dispatch_width == 16) {
2921 /* R28: interpolated depth if not 8-wide. */
2922 c->nr_payload_regs++;
2923 }
2924 }
2925 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2926 if (uses_depth) {
2927 c->source_w_reg = c->nr_payload_regs;
2928 c->nr_payload_regs++;
2929 if (dispatch_width == 16) {
2930 /* R30: interpolated W if not 8-wide. */
2931 c->nr_payload_regs++;
2932 }
2933 }
2934 /* R31: MSAA position offsets. */
2935 /* R32-: bary for 32-pixel. */
2936 /* R58-59: interp W for 32-pixel. */
2937
2938 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2939 c->source_depth_to_render_target = true;
2940 }
2941 }
2942
2943 bool
2944 fs_visitor::run()
2945 {
2946 sanity_param_count = fp->Base.Parameters->NumParameters;
2947 uint32_t orig_nr_params = c->prog_data.nr_params;
2948
2949 if (brw->gen >= 6)
2950 setup_payload_gen6();
2951 else
2952 setup_payload_gen4();
2953
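/* Debug hook: flipping this to 1 compiles a trivial dummy shader instead of
 * the real program. */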
2954 if (0) {
2955 emit_dummy_fs();
2956 } else {
2957 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2958 emit_shader_time_begin();
2959
2960 calculate_urb_setup();
2961 if (brw->gen < 6)
2962 emit_interpolation_setup_gen4();
2963 else
2964 emit_interpolation_setup_gen6();
2965
2966 /* We handle discards by keeping track of the still-live pixels in f0.1.
2967 * Initialize it with the dispatched pixels.
2968 */
2969 if (fp->UsesKill) {
2970 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2971 discard_init->flag_subreg = 1;
2972 }
2973
2974 /* Generate FS IR for main().  (The visitor only descends into
2975 * functions called "main".)
2976 */
2977 if (shader) {
2978 foreach_list(node, &*shader->ir) {
2979 ir_instruction *ir = (ir_instruction *)node;
2980 base_ir = ir;
2981 this->result = reg_undef;
2982 ir->accept(this);
2983 }
2984 } else {
2985 emit_fragment_program_code();
2986 }
2987 base_ir = NULL;
2988 if (failed)
2989 return false;
2990
2991 emit(FS_OPCODE_PLACEHOLDER_HALT);
2992
2993 emit_fb_writes();
2994
2995 split_virtual_grfs();
2996
2997 move_uniform_array_access_to_pull_constants();
2998 setup_pull_constants();
2999
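/* Run the optimization passes to a fixed point: each pass reports whether
 * it changed anything, and the loop repeats until a full round makes no
 * progress. */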
3000 bool progress;
3001 do {
3002 progress = false;
3003
3004 compact_virtual_grfs();
3005
3006 progress = remove_duplicate_mrf_writes() || progress;
3007
3008 progress = opt_algebraic() || progress;
3009 progress = opt_cse() || progress;
3010 progress = opt_copy_propagate() || progress;
3011 progress = dead_code_eliminate() || progress;
3012 progress = dead_code_eliminate_local() || progress;
3013 progress = register_coalesce() || progress;
3014 progress = register_coalesce_2() || progress;
3015 progress = compute_to_mrf() || progress;
3016 } while (progress);
3017
3018 remove_dead_constants();
3019
3020 schedule_instructions(false);
3021
3022 lower_uniform_pull_constant_loads();
3023
3024 assign_curb_setup();
3025 assign_urb_setup();
3026
3027 if (0) {
3028 /* Debug of register spilling: Go spill everything. */
3029 for (int i = 0; i < virtual_grf_count; i++) {
3030 spill_reg(i);
3031 }
3032 }
3033
3034 if (0)
3035 assign_regs_trivial();
3036 else {
3037 while (!assign_regs()) {
3038 if (failed)
3039 break;
3040 }
3041 }
3042 }
3043 assert(force_uncompressed_stack == 0);
3044 assert(force_sechalf_stack == 0);
3045
3046 /* This must come after all optimization and register allocation, since
3047 * it inserts dead code that happens to have side effects, and it does
3048 * so based on the actual physical registers in use.
3049 */
3050 insert_gen4_send_dependency_workarounds();
3051
3052 if (failed)
3053 return false;
3054
3055 schedule_instructions(true);
3056
3057 if (dispatch_width == 8) {
3058 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3059 } else {
3060 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3061
3062 /* Make sure we didn't try to sneak in an extra uniform */
3063 assert(orig_nr_params == c->prog_data.nr_params);
3064 (void) orig_nr_params;
3065 }
3066
3067 /* If any state parameters were appended, then ParameterValues could have
3068 * been realloced, in which case the driver uniform storage set up by
3069 * _mesa_associate_uniform_storage() would point to freed memory. Make
3070 * sure that didn't happen.
3071 */
3072 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3073
3074 return !failed;
3075 }
3076
3077 const unsigned *
3078 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3079 struct gl_fragment_program *fp,
3080 struct gl_shader_program *prog,
3081 unsigned *final_assembly_size)
3082 {
3083 bool start_busy = false;
3084 float start_time = 0;
3085
3086 if (unlikely(brw->perf_debug)) {
3087 start_busy = (brw->batch.last_bo &&
3088 drm_intel_bo_busy(brw->batch.last_bo));
3089 start_time = get_time();
3090 }
3091
3092 struct brw_shader *shader = NULL;
3093 if (prog)
3094 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3095
3096 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3097 if (prog) {
3098 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3099 _mesa_print_ir(shader->ir, NULL);
3100 printf("\n\n");
3101 } else {
3102 printf("ARB_fragment_program %d ir for native fragment shader\n",
3103 fp->Base.Id);
3104 _mesa_print_program(&fp->Base);
3105 }
3106 }
3107
3108 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3109 */
3110 fs_visitor v(brw, c, prog, fp, 8);
3111 if (!v.run()) {
3112 if (prog) {
3113 prog->LinkStatus = false;
3114 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3115 }
3116
3117 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3118 v.fail_msg);
3119
3120 return NULL;
3121 }
3122
3123 exec_list *simd16_instructions = NULL;
3124 fs_visitor v2(brw, c, prog, fp, 16);
3125 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3126 if (c->prog_data.nr_pull_params == 0) {
3127 /* Try a 16-wide compile */
3128 v2.import_uniforms(&v);
3129 if (!v2.run()) {
3130 perf_debug("16-wide shader failed to compile, falling back to "
3131 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3132 } else {
3133 simd16_instructions = &v2.instructions;
3134 }
3135 } else {
3136 perf_debug("Skipping 16-wide due to pull parameters.\n");
3137 }
3138 }
3139
3140 c->prog_data.dispatch_width = 8;
3141
3142 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3143 const unsigned *generated = g.generate_assembly(&v.instructions,
3144 simd16_instructions,
3145 final_assembly_size);
3146
3147 if (unlikely(brw->perf_debug) && shader) {
3148 if (shader->compiled_once)
3149 brw_wm_debug_recompile(brw, prog, &c->key);
3150 shader->compiled_once = true;
3151
3152 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3153 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3154 (get_time() - start_time) * 1000);
3155 }
3156 }
3157
3158 return generated;
3159 }
3160
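/**
 * Precompiles the fragment shader at link time using a guessed default
 * program key, in the hope that the real draw-time key matches and no
 * recompile is needed then.
 */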
3161 bool
3162 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3163 {
3164 struct brw_context *brw = brw_context(ctx);
3165 struct brw_wm_prog_key key;
3166
3167 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3168 return true;
3169
3170 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3171 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3172 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3173 bool program_uses_dfdy = fp->UsesDFdy;
3174
3175 memset(&key, 0, sizeof(key));
3176
3177 if (brw->gen < 6) {
3178 if (fp->UsesKill)
3179 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3180
3181 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3182 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3183
3184 /* Just assume depth testing. */
3185 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3186 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3187 }
3188
3189 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3190 BRW_FS_VARYING_INPUT_MASK) > 16)
3191 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3192
3193 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3194
3195 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3196 for (unsigned i = 0; i < sampler_count; i++) {
3197 if (fp->Base.ShadowSamplers & (1 << i)) {
3198 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3199 key.tex.swizzles[i] =
3200 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3201 } else {
3202 /* Color sampler: assume no swizzling. */
3203 key.tex.swizzles[i] = SWIZZLE_XYZW;
3204 }
3205 }
3206
3207 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3208 key.drawable_height = ctx->DrawBuffer->Height;
3209 }
3210
3211 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3212 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3213 }
3214
3215 key.nr_color_regions = 1;
3216
3217 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3218 * quality of the derivatives is likely to be determined by the driconf
3219 * option.
3220 */
3221 key.high_quality_derivatives = brw->disable_derivative_optimization;
3222
3223 key.program_string_id = bfp->id;
3224
3225 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3226 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3227
3228 bool success = do_wm_prog(brw, prog, bfp, &key);
3229
3230 brw->wm.base.prog_offset = old_prog_offset;
3231 brw->wm.prog_data = old_prog_data;
3232
3233 return success;
3234 }