i965: Make a brw_stage_prog_data for storing the SURF_INDEX information.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "main/uniforms.h"
50 #include "brw_fs_live_variables.h"
51 #include "glsl/glsl_types.h"
52
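/** Shared initializer for the fs_inst constructors below: zero the whole
 * instruction, then set safe defaults (NOP opcode, no conditional mod,
 * undefined operands, one register written).
 */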
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
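/* Convenience wrappers: fs_visitor::OP(dst, srcs...) allocates and returns a
 * new fs_inst with the corresponding BRW_OPCODE out of mem_ctx; the caller is
 * still responsible for emitting it.
 */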
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
176 ALU1(BFREV)
177 ALU3(BFE)
178 ALU2(BFI1)
179 ALU3(BFI2)
180 ALU1(FBH)
181 ALU1(FBL)
182 ALU1(CBIT)
183 ALU3(MAD)
184 ALU2(ADDC)
185 ALU2(SUBB)
186
187 /** Gen4 predicated IF. */
188 fs_inst *
189 fs_visitor::IF(uint32_t predicate)
190 {
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
192 inst->predicate = predicate;
193 return inst;
194 }
195
196 /** Gen6+ IF with embedded comparison. */
197 fs_inst *
198 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
199 {
200 assert(brw->gen >= 6);
201 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
202 reg_null_d, src0, src1);
203 inst->conditional_mod = condition;
204 return inst;
205 }
206
207 /**
208 * CMP: Sets the low bit of the destination channels with the result
209 * of the comparison, while the upper bits are undefined, and updates
210 * the flag register with the packed 16 bits of the result.
211 */
212 fs_inst *
213 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
214 {
215 fs_inst *inst;
216
217 /* Take the instruction:
218 *
219 * CMP null<d> src0<f> src1<f>
220 *
221 * Original gen4 does type conversion to the destination type before
222 * comparison, producing garbage results for floating point comparisons.
223 * gen5 does the comparison on the execution type (resolved source types),
224 * so dst type doesn't matter. gen6 does comparison and then uses the
225 * result as if it was the dst type with no conversion, which happens to
226 * mostly work out for float-interpreted-as-int since our comparisons are
227 * for >0, =0, <0.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 exec_list
245 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
246 fs_reg varying_offset,
247 uint32_t const_offset)
248 {
249 exec_list instructions;
250 fs_inst *inst;
251
252 /* We have our constant surface use a pitch of 4 bytes, so our index can
253 * be any component of a vector, and then we load 4 contiguous
254 * components starting from that.
255 *
256 * We break down the const_offset to a portion added to the variable
257 * offset and a portion done using reg_offset, which means that if you
258 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
259 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
260 * CSE can later notice that those loads are all the same and eliminate
261 * the redundant ones.
262 */
263 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
264 instructions.push_tail(ADD(vec4_offset,
265 varying_offset, const_offset & ~3));
266
267 int scale = 1;
268 if (brw->gen == 4 && dispatch_width == 8) {
269 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
270 * u, v, r) as parameters, or we can just use the SIMD16 message
271 * consisting of (header, u). We choose the second, at the cost of a
272 * longer return length.
273 */
274 scale = 2;
275 }
276
277 enum opcode op;
278 if (brw->gen >= 7)
279 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
280 else
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
282 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
283 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
284 inst->regs_written = 4 * scale;
285 instructions.push_tail(inst);
286
287 if (brw->gen < 7) {
288 inst->base_mrf = 13;
289 inst->header_present = true;
290 if (brw->gen == 4)
291 inst->mlen = 3;
292 else
293 inst->mlen = 1 + dispatch_width / 8;
294 }
295
296 vec4_result.reg_offset += (const_offset & 3) * scale;
297 instructions.push_tail(MOV(dst, vec4_result));
298
299 return instructions;
300 }
301
302 /**
303 * A helper for MOV generation for fixing up broken hardware SEND dependency
304 * handling.
305 */
306 fs_inst *
307 fs_visitor::DEP_RESOLVE_MOV(int grf)
308 {
309 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
310
311 inst->ir = NULL;
312 inst->annotation = "send dependency resolve";
313
314 /* The caller always wants uncompressed to emit the minimal extra
315 * dependencies, and to avoid having to deal with aligning its regs to 2.
316 */
317 inst->force_uncompressed = true;
318
319 return inst;
320 }
321
322 bool
323 fs_inst::equals(fs_inst *inst)
324 {
325 return (opcode == inst->opcode &&
326 dst.equals(inst->dst) &&
327 src[0].equals(inst->src[0]) &&
328 src[1].equals(inst->src[1]) &&
329 src[2].equals(inst->src[2]) &&
330 saturate == inst->saturate &&
331 predicate == inst->predicate &&
332 conditional_mod == inst->conditional_mod &&
333 mlen == inst->mlen &&
334 base_mrf == inst->base_mrf &&
335 sampler == inst->sampler &&
336 target == inst->target &&
337 eot == inst->eot &&
338 header_present == inst->header_present &&
339 shadow_compare == inst->shadow_compare &&
340 offset == inst->offset);
341 }
342
343 bool
344 fs_inst::overwrites_reg(const fs_reg &reg)
345 {
346 return (reg.file == dst.file &&
347 reg.reg == dst.reg &&
348 reg.reg_offset >= dst.reg_offset &&
349 reg.reg_offset < dst.reg_offset + regs_written);
350 }
351
352 bool
353 fs_inst::is_send_from_grf()
354 {
355 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
356 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
357 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
358 src[1].file == GRF) ||
359 (is_tex() && src[0].file == GRF));
360 }
361
362 bool
363 fs_visitor::can_do_source_mods(fs_inst *inst)
364 {
365 if (brw->gen == 6 && inst->is_math())
366 return false;
367
368 if (inst->is_send_from_grf())
369 return false;
370
371 if (!inst->can_do_source_mods())
372 return false;
373
374 return true;
375 }
376
377 void
378 fs_reg::init()
379 {
380 memset(this, 0, sizeof(*this));
381 this->smear = -1;
382 }
383
384 /** Generic unset register constructor. */
385 fs_reg::fs_reg()
386 {
387 init();
388 this->file = BAD_FILE;
389 }
390
391 /** Immediate value constructor. */
392 fs_reg::fs_reg(float f)
393 {
394 init();
395 this->file = IMM;
396 this->type = BRW_REGISTER_TYPE_F;
397 this->imm.f = f;
398 }
399
400 /** Immediate value constructor. */
401 fs_reg::fs_reg(int32_t i)
402 {
403 init();
404 this->file = IMM;
405 this->type = BRW_REGISTER_TYPE_D;
406 this->imm.i = i;
407 }
408
409 /** Immediate value constructor. */
410 fs_reg::fs_reg(uint32_t u)
411 {
412 init();
413 this->file = IMM;
414 this->type = BRW_REGISTER_TYPE_UD;
415 this->imm.u = u;
416 }
417
418 /** Fixed brw_reg Immediate value constructor. */
419 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
420 {
421 init();
422 this->file = HW_REG;
423 this->fixed_hw_reg = fixed_hw_reg;
424 this->type = fixed_hw_reg.type;
425 }
426
427 bool
428 fs_reg::equals(const fs_reg &r) const
429 {
430 return (file == r.file &&
431 reg == r.reg &&
432 reg_offset == r.reg_offset &&
433 type == r.type &&
434 negate == r.negate &&
435 abs == r.abs &&
436 !reladdr && !r.reladdr &&
437 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
438 sizeof(fixed_hw_reg)) == 0 &&
439 smear == r.smear &&
440 imm.u == r.imm.u);
441 }
442
443 fs_reg
444 fs_reg::retype(uint32_t type)
445 {
446 fs_reg result = *this;
447 result.type = type;
448 return result;
449 }
450
451 bool
452 fs_reg::is_zero() const
453 {
454 if (file != IMM)
455 return false;
456
457 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
458 }
459
460 bool
461 fs_reg::is_one() const
462 {
463 if (file != IMM)
464 return false;
465
466 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
467 }
468
469 bool
470 fs_reg::is_valid_3src() const
471 {
472 return file == GRF || file == UNIFORM;
473 }
474
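/** Returns the number of scalar slots a GLSL type occupies when allocated as
 * a virtual GRF or uniform (one slot per component; samplers take none).
 */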
475 int
476 fs_visitor::type_size(const struct glsl_type *type)
477 {
478 unsigned int size, i;
479
480 switch (type->base_type) {
481 case GLSL_TYPE_UINT:
482 case GLSL_TYPE_INT:
483 case GLSL_TYPE_FLOAT:
484 case GLSL_TYPE_BOOL:
485 return type->components();
486 case GLSL_TYPE_ARRAY:
487 return type_size(type->fields.array) * type->length;
488 case GLSL_TYPE_STRUCT:
489 size = 0;
490 for (i = 0; i < type->length; i++) {
491 size += type_size(type->fields.structure[i].type);
492 }
493 return size;
494 case GLSL_TYPE_SAMPLER:
495 /* Samplers take up no register space, since they're baked in at
496 * link time.
497 */
498 return 0;
499 case GLSL_TYPE_VOID:
500 case GLSL_TYPE_ERROR:
501 case GLSL_TYPE_INTERFACE:
502 assert(!"not reached");
503 break;
504 }
505
506 return 0;
507 }
508
509 fs_reg
510 fs_visitor::get_timestamp()
511 {
512 assert(brw->gen >= 7);
513
514 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
515 BRW_ARF_TIMESTAMP,
516 0),
517 BRW_REGISTER_TYPE_UD));
518
519 fs_reg dst = fs_reg(this, glsl_type::uint_type);
520
521 fs_inst *mov = emit(MOV(dst, ts));
522 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
523 * even if it's not enabled in the dispatch.
524 */
525 mov->force_writemask_all = true;
526 mov->force_uncompressed = true;
527
528 /* The caller wants the low 32 bits of the timestamp. Since it's running
529 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
530 * which is plenty of time for our purposes. It is identical across the
531 * EUs, but since it's tracking GPU core speed it will increment at a
532 * varying rate as render P-states change.
533 *
534 * The caller could also check if render P-states have changed (or anything
535 * else that might disrupt timing) by setting smear to 2 and checking if
536 * that field is != 0.
537 */
538 dst.smear = 0;
539
540 return dst;
541 }
542
543 void
544 fs_visitor::emit_shader_time_begin()
545 {
546 current_annotation = "shader time start";
547 shader_start_time = get_timestamp();
548 }
549
550 void
551 fs_visitor::emit_shader_time_end()
552 {
553 current_annotation = "shader time end";
554
555 enum shader_time_shader_type type, written_type, reset_type;
556 if (dispatch_width == 8) {
557 type = ST_FS8;
558 written_type = ST_FS8_WRITTEN;
559 reset_type = ST_FS8_RESET;
560 } else {
561 assert(dispatch_width == 16);
562 type = ST_FS16;
563 written_type = ST_FS16_WRITTEN;
564 reset_type = ST_FS16_RESET;
565 }
566
567 fs_reg shader_end_time = get_timestamp();
568
569 /* Check that there weren't any timestamp reset events (assuming these
570 * were the only two timestamp reads that happened).
571 */
572 fs_reg reset = shader_end_time;
573 reset.smear = 2;
574 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
575 test->conditional_mod = BRW_CONDITIONAL_Z;
576 emit(IF(BRW_PREDICATE_NORMAL));
577
578 push_force_uncompressed();
579 fs_reg start = shader_start_time;
580 start.negate = true;
581 fs_reg diff = fs_reg(this, glsl_type::uint_type);
582 emit(ADD(diff, start, shader_end_time));
583
584 /* If there were no instructions between the two timestamp gets, the diff
585 * is 2 cycles. Remove that overhead, so I can forget about that when
586 * trying to determine the time taken for single instructions.
587 */
588 emit(ADD(diff, diff, fs_reg(-2u)));
589
590 emit_shader_time_write(type, diff);
591 emit_shader_time_write(written_type, fs_reg(1u));
592 emit(BRW_OPCODE_ELSE);
593 emit_shader_time_write(reset_type, fs_reg(1u));
594 emit(BRW_OPCODE_ENDIF);
595
596 pop_force_uncompressed();
597 }
598
599 void
600 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
601 fs_reg value)
602 {
603 int shader_time_index =
604 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
605 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
606
607 fs_reg payload;
608 if (dispatch_width == 8)
609 payload = fs_reg(this, glsl_type::uvec2_type);
610 else
611 payload = fs_reg(this, glsl_type::uint_type);
612
613 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
614 fs_reg(), payload, offset, value));
615 }
616
617 void
618 fs_visitor::fail(const char *format, ...)
619 {
620 va_list va;
621 char *msg;
622
623 if (failed)
624 return;
625
626 failed = true;
627
628 va_start(va, format);
629 msg = ralloc_vasprintf(mem_ctx, format, va);
630 va_end(va);
631 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
632
633 this->fail_msg = msg;
634
635 if (INTEL_DEBUG & DEBUG_WM) {
636 fprintf(stderr, "%s", msg);
637 }
638 }
639
640 fs_inst *
641 fs_visitor::emit(enum opcode opcode)
642 {
643 return emit(fs_inst(opcode));
644 }
645
646 fs_inst *
647 fs_visitor::emit(enum opcode opcode, fs_reg dst)
648 {
649 return emit(fs_inst(opcode, dst));
650 }
651
652 fs_inst *
653 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
654 {
655 return emit(fs_inst(opcode, dst, src0));
656 }
657
658 fs_inst *
659 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
660 {
661 return emit(fs_inst(opcode, dst, src0, src1));
662 }
663
664 fs_inst *
665 fs_visitor::emit(enum opcode opcode, fs_reg dst,
666 fs_reg src0, fs_reg src1, fs_reg src2)
667 {
668 return emit(fs_inst(opcode, dst, src0, src1, src2));
669 }
670
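/* Nesting counters that scope force_uncompressed / force_sechalf over a range
 * of emitted instructions; see emit_shader_time_end() for a usage example.
 */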
671 void
672 fs_visitor::push_force_uncompressed()
673 {
674 force_uncompressed_stack++;
675 }
676
677 void
678 fs_visitor::pop_force_uncompressed()
679 {
680 force_uncompressed_stack--;
681 assert(force_uncompressed_stack >= 0);
682 }
683
684 void
685 fs_visitor::push_force_sechalf()
686 {
687 force_sechalf_stack++;
688 }
689
690 void
691 fs_visitor::pop_force_sechalf()
692 {
693 force_sechalf_stack--;
694 assert(force_sechalf_stack >= 0);
695 }
696
697 /**
698 * Returns true if the instruction has a flag that means it won't
699 * update an entire destination register.
700 *
701 * For example, dead code elimination and live variable analysis want to know
702 * when a write to a variable screens off any preceding values that were in
703 * it.
704 */
705 bool
706 fs_inst::is_partial_write()
707 {
708 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
709 this->force_uncompressed ||
710 this->force_sechalf);
711 }
712
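/** Returns how many GRF registers source argument \p arg reads: texturing
 * sends from a GRF consume their whole message payload, everything else
 * reads a single register.
 */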
713 int
714 fs_inst::regs_read(fs_visitor *v, int arg)
715 {
716 if (is_tex() && arg == 0 && src[0].file == GRF) {
717 if (v->dispatch_width == 16)
718 return (mlen + 1) / 2;
719 else
720 return mlen;
721 }
722 return 1;
723 }
724
725 /**
726 * Returns how many MRFs an FS opcode will write over.
727 *
728 * Note that this is not the 0 or 1 implied writes in an actual gen
729 * instruction -- the FS opcodes often generate MOVs in addition.
730 */
731 int
732 fs_visitor::implied_mrf_writes(fs_inst *inst)
733 {
734 if (inst->mlen == 0)
735 return 0;
736
737 if (inst->base_mrf == -1)
738 return 0;
739
740 switch (inst->opcode) {
741 case SHADER_OPCODE_RCP:
742 case SHADER_OPCODE_RSQ:
743 case SHADER_OPCODE_SQRT:
744 case SHADER_OPCODE_EXP2:
745 case SHADER_OPCODE_LOG2:
746 case SHADER_OPCODE_SIN:
747 case SHADER_OPCODE_COS:
748 return 1 * dispatch_width / 8;
749 case SHADER_OPCODE_POW:
750 case SHADER_OPCODE_INT_QUOTIENT:
751 case SHADER_OPCODE_INT_REMAINDER:
752 return 2 * dispatch_width / 8;
753 case SHADER_OPCODE_TEX:
754 case FS_OPCODE_TXB:
755 case SHADER_OPCODE_TXD:
756 case SHADER_OPCODE_TXF:
757 case SHADER_OPCODE_TXF_MS:
758 case SHADER_OPCODE_TG4:
759 case SHADER_OPCODE_TXL:
760 case SHADER_OPCODE_TXS:
761 case SHADER_OPCODE_LOD:
762 return 1;
763 case FS_OPCODE_FB_WRITE:
764 return 2;
765 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
766 case FS_OPCODE_UNSPILL:
767 return 1;
768 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
769 return inst->mlen;
770 case FS_OPCODE_SPILL:
771 return 2;
772 default:
773 assert(!"not reached");
774 return inst->mlen;
775 }
776 }
777
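/** Allocates a new virtual GRF of \p size registers, growing the size
 * tracking array as needed, and returns its index.
 */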
778 int
779 fs_visitor::virtual_grf_alloc(int size)
780 {
781 if (virtual_grf_array_size <= virtual_grf_count) {
782 if (virtual_grf_array_size == 0)
783 virtual_grf_array_size = 16;
784 else
785 virtual_grf_array_size *= 2;
786 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
787 virtual_grf_array_size);
788 }
789 virtual_grf_sizes[virtual_grf_count] = size;
790 return virtual_grf_count++;
791 }
792
793 /** Fixed HW reg constructor. */
794 fs_reg::fs_reg(enum register_file file, int reg)
795 {
796 init();
797 this->file = file;
798 this->reg = reg;
799 this->type = BRW_REGISTER_TYPE_F;
800 }
801
802 /** Fixed HW reg constructor. */
803 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
804 {
805 init();
806 this->file = file;
807 this->reg = reg;
808 this->type = type;
809 }
810
811 /** Automatic reg constructor. */
812 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
813 {
814 init();
815
816 this->file = GRF;
817 this->reg = v->virtual_grf_alloc(v->type_size(type));
818 this->reg_offset = 0;
819 this->type = brw_type_for_base_type(type);
820 }
821
822 fs_reg *
823 fs_visitor::variable_storage(ir_variable *var)
824 {
825 return (fs_reg *)hash_table_find(this->variable_ht, var);
826 }
827
828 void
829 import_uniforms_callback(const void *key,
830 void *data,
831 void *closure)
832 {
833 struct hash_table *dst_ht = (struct hash_table *)closure;
834 const fs_reg *reg = (const fs_reg *)data;
835
836 if (reg->file != UNIFORM)
837 return;
838
839 hash_table_insert(dst_ht, data, key);
840 }
841
842 /* For 16-wide, we reuse the uniform setup done by the 8-wide dispatch.
843 * This imports those uniform definitions.
844 */
845 void
846 fs_visitor::import_uniforms(fs_visitor *v)
847 {
848 hash_table_call_foreach(v->variable_ht,
849 import_uniforms_callback,
850 variable_ht);
851 this->params_remap = v->params_remap;
852 this->nr_params_remap = v->nr_params_remap;
853 }
854
855 /* Our support for uniforms is piggy-backed on the struct
856 * gl_fragment_program, because that's where the values actually
857 * get stored, rather than in some global gl_shader_program uniform
858 * store.
859 */
860 void
861 fs_visitor::setup_uniform_values(ir_variable *ir)
862 {
863 int namelen = strlen(ir->name);
864
865 /* The data for our (non-builtin) uniforms is stored in a series of
866 * gl_uniform_driver_storage structs for each subcomponent that
867 * glGetUniformLocation() could name. We know it's been set up in the same
868 * order we'd walk the type, so walk the list of storage and find anything
869 * with our name, or the prefix of a component that starts with our name.
870 */
871 unsigned params_before = c->prog_data.nr_params;
872 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
873 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
874
875 if (strncmp(ir->name, storage->name, namelen) != 0 ||
876 (storage->name[namelen] != 0 &&
877 storage->name[namelen] != '.' &&
878 storage->name[namelen] != '[')) {
879 continue;
880 }
881
882 unsigned slots = storage->type->component_slots();
883 if (storage->array_elements)
884 slots *= storage->array_elements;
885
886 for (unsigned i = 0; i < slots; i++) {
887 c->prog_data.param[c->prog_data.nr_params++] =
888 &storage->storage[i].f;
889 }
890 }
891
892 /* Make sure we actually initialized the right amount of stuff here. */
893 assert(params_before + ir->type->component_slots() ==
894 c->prog_data.nr_params);
895 (void)params_before;
896 }
897
898
899 /* Our support for builtin uniforms is even scarier than non-builtin.
900 * It sits on top of the PROG_STATE_VAR parameters that are
901 * automatically updated from GL context state.
902 */
903 void
904 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
905 {
906 const ir_state_slot *const slots = ir->state_slots;
907 assert(ir->state_slots != NULL);
908
909 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
910 /* This state reference has already been setup by ir_to_mesa, but we'll
911 * get the same index back here.
912 */
913 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
914 (gl_state_index *)slots[i].tokens);
915
916 /* Add each of the unique swizzles of the element as a parameter.
917 * This'll end up matching the expected layout of the
918 * array/matrix/structure we're trying to fill in.
919 */
920 int last_swiz = -1;
921 for (unsigned int j = 0; j < 4; j++) {
922 int swiz = GET_SWZ(slots[i].swizzle, j);
923 if (swiz == last_swiz)
924 break;
925 last_swiz = swiz;
926
927 c->prog_data.param[c->prog_data.nr_params++] =
928 &fp->Base.Parameters->ParameterValues[index][swiz].f;
929 }
930 }
931 }
932
933 fs_reg *
934 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
935 {
936 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
937 fs_reg wpos = *reg;
938 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
939
940 /* gl_FragCoord.x */
941 if (ir->pixel_center_integer) {
942 emit(MOV(wpos, this->pixel_x));
943 } else {
944 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
945 }
946 wpos.reg_offset++;
947
948 /* gl_FragCoord.y */
949 if (!flip && ir->pixel_center_integer) {
950 emit(MOV(wpos, this->pixel_y));
951 } else {
952 fs_reg pixel_y = this->pixel_y;
953 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
954
955 if (flip) {
956 pixel_y.negate = true;
957 offset += c->key.drawable_height - 1.0;
958 }
959
960 emit(ADD(wpos, pixel_y, fs_reg(offset)));
961 }
962 wpos.reg_offset++;
963
964 /* gl_FragCoord.z */
965 if (brw->gen >= 6) {
966 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
967 } else {
968 emit(FS_OPCODE_LINTERP, wpos,
969 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
970 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
971 interp_reg(VARYING_SLOT_POS, 2));
972 }
973 wpos.reg_offset++;
974
975 /* gl_FragCoord.w: Already set up in emit_interpolation */
976 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
977
978 return reg;
979 }
980
981 fs_inst *
982 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
983 glsl_interp_qualifier interpolation_mode,
984 bool is_centroid)
985 {
986 brw_wm_barycentric_interp_mode barycoord_mode;
987 if (brw->gen >= 6) {
988 if (is_centroid) {
989 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
990 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
991 else
992 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
993 } else {
994 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
995 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
996 else
997 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
998 }
999 } else {
1000 /* On Ironlake and below, there is only one interpolation mode.
1001 * Centroid interpolation doesn't mean anything on this hardware --
1002 * there is no multisampling.
1003 */
1004 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1005 }
1006 return emit(FS_OPCODE_LINTERP, attr,
1007 this->delta_x[barycoord_mode],
1008 this->delta_y[barycoord_mode], interp);
1009 }
1010
1011 fs_reg *
1012 fs_visitor::emit_general_interpolation(ir_variable *ir)
1013 {
1014 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1015 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1016 fs_reg attr = *reg;
1017
1018 unsigned int array_elements;
1019 const glsl_type *type;
1020
1021 if (ir->type->is_array()) {
1022 array_elements = ir->type->length;
1023 if (array_elements == 0) {
1024 fail("dereferenced array '%s' has length 0\n", ir->name);
1025 }
1026 type = ir->type->fields.array;
1027 } else {
1028 array_elements = 1;
1029 type = ir->type;
1030 }
1031
1032 glsl_interp_qualifier interpolation_mode =
1033 ir->determine_interpolation_mode(c->key.flat_shade);
1034
1035 int location = ir->location;
1036 for (unsigned int i = 0; i < array_elements; i++) {
1037 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1038 if (c->prog_data.urb_setup[location] == -1) {
1039 /* If there's no incoming setup data for this slot, don't
1040 * emit interpolation for it.
1041 */
1042 attr.reg_offset += type->vector_elements;
1043 location++;
1044 continue;
1045 }
1046
1047 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1048 /* Constant interpolation (flat shading) case. The SF has
1049 * handed us defined values in only the constant offset
1050 * field of the setup reg.
1051 */
1052 for (unsigned int k = 0; k < type->vector_elements; k++) {
1053 struct brw_reg interp = interp_reg(location, k);
1054 interp = suboffset(interp, 3);
1055 interp.type = reg->type;
1056 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1057 attr.reg_offset++;
1058 }
1059 } else {
1060 /* Smooth/noperspective interpolation case. */
1061 for (unsigned int k = 0; k < type->vector_elements; k++) {
1062 /* FINISHME: At some point we probably want to push
1063 * this farther by giving similar treatment to the
1064 * other potentially constant components of the
1065 * attribute, as well as making brw_vs_constval.c
1066 * handle varyings other than gl_TexCoord.
1067 */
1068 struct brw_reg interp = interp_reg(location, k);
1069 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1070 ir->centroid);
1071 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1072 /* Get the pixel/sample mask into f0 so that we know
1073 * which pixels are lit. Then, for each channel that is
1074 * unlit, replace the centroid data with non-centroid
1075 * data.
1076 */
1077 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1078 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1079 interpolation_mode, false);
1080 inst->predicate = BRW_PREDICATE_NORMAL;
1081 inst->predicate_inverse = true;
1082 }
1083 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1084 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1085 }
1086 attr.reg_offset++;
1087 }
1088
1089 }
1090 location++;
1091 }
1092 }
1093
1094 return reg;
1095 }
1096
1097 fs_reg *
1098 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1099 {
1100 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1101
1102 /* The frontfacing comes in as a bit in the thread payload. */
1103 if (brw->gen >= 6) {
1104 emit(BRW_OPCODE_ASR, *reg,
1105 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1106 fs_reg(15));
1107 emit(BRW_OPCODE_NOT, *reg, *reg);
1108 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1109 } else {
1110 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1111 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1112 * us front face
1113 */
1114 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1115 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1116 }
1117
1118 return reg;
1119 }
1120
1121 fs_reg
1122 fs_visitor::fix_math_operand(fs_reg src)
1123 {
1124 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1125 * might be able to do better by doing execsize = 1 math and then
1126 * expanding that result out, but we would need to be careful with
1127 * masking.
1128 *
1129 * The hardware ignores source modifiers (negate and abs) on math
1130 * instructions, so we also move to a temp to set those up.
1131 */
1132 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1133 !src.abs && !src.negate)
1134 return src;
1135
1136 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1137 * operands to math instructions.
1138 */
1139 if (brw->gen >= 7 && src.file != IMM)
1140 return src;
1141
1142 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1143 expanded.type = src.type;
1144 emit(BRW_OPCODE_MOV, expanded, src);
1145 return expanded;
1146 }
1147
1148 fs_inst *
1149 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1150 {
1151 switch (opcode) {
1152 case SHADER_OPCODE_RCP:
1153 case SHADER_OPCODE_RSQ:
1154 case SHADER_OPCODE_SQRT:
1155 case SHADER_OPCODE_EXP2:
1156 case SHADER_OPCODE_LOG2:
1157 case SHADER_OPCODE_SIN:
1158 case SHADER_OPCODE_COS:
1159 break;
1160 default:
1161 assert(!"not reached: bad math opcode");
1162 return NULL;
1163 }
1164
1165 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1166 * might be able to do better by doing execsize = 1 math and then
1167 * expanding that result out, but we would need to be careful with
1168 * masking.
1169 *
1170 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1171 * instructions, so we also move to a temp to set those up.
1172 */
1173 if (brw->gen >= 6)
1174 src = fix_math_operand(src);
1175
1176 fs_inst *inst = emit(opcode, dst, src);
1177
1178 if (brw->gen < 6) {
1179 inst->base_mrf = 2;
1180 inst->mlen = dispatch_width / 8;
1181 }
1182
1183 return inst;
1184 }
1185
1186 fs_inst *
1187 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1188 {
1189 int base_mrf = 2;
1190 fs_inst *inst;
1191
1192 switch (opcode) {
1193 case SHADER_OPCODE_INT_QUOTIENT:
1194 case SHADER_OPCODE_INT_REMAINDER:
1195 if (brw->gen >= 7 && dispatch_width == 16)
1196 fail("16-wide INTDIV unsupported\n");
1197 break;
1198 case SHADER_OPCODE_POW:
1199 break;
1200 default:
1201 assert(!"not reached: unsupported binary math opcode.");
1202 return NULL;
1203 }
1204
1205 if (brw->gen >= 6) {
1206 src0 = fix_math_operand(src0);
1207 src1 = fix_math_operand(src1);
1208
1209 inst = emit(opcode, dst, src0, src1);
1210 } else {
1211 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1212 * "Message Payload":
1213 *
1214 * "Operand0[7]. For the INT DIV functions, this operand is the
1215 * denominator."
1216 * ...
1217 * "Operand1[7]. For the INT DIV functions, this operand is the
1218 * numerator."
1219 */
1220 bool is_int_div = opcode != SHADER_OPCODE_POW;
1221 fs_reg &op0 = is_int_div ? src1 : src0;
1222 fs_reg &op1 = is_int_div ? src0 : src1;
1223
1224 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1225 inst = emit(opcode, dst, op0, reg_null_f);
1226
1227 inst->base_mrf = base_mrf;
1228 inst->mlen = 2 * dispatch_width / 8;
1229 }
1230 return inst;
1231 }
1232
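/** Lays out the push constants (CURB): records how many registers of
 * constants are read and rewrites UNIFORM-file sources as fixed payload
 * registers.
 */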
1233 void
1234 fs_visitor::assign_curb_setup()
1235 {
1236 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1237 if (dispatch_width == 8) {
1238 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1239 } else {
1240 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1241 }
1242
1243 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1244 foreach_list(node, &this->instructions) {
1245 fs_inst *inst = (fs_inst *)node;
1246
1247 for (unsigned int i = 0; i < 3; i++) {
1248 if (inst->src[i].file == UNIFORM) {
1249 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1250 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1251 constant_nr / 8,
1252 constant_nr % 8);
1253
1254 inst->src[i].file = HW_REG;
1255 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1256 }
1257 }
1258 }
1259 }
1260
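/** Decides which incoming URB slot each FS input varying is read from,
 * filling in c->prog_data.urb_setup[] and num_varying_inputs.
 */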
1261 void
1262 fs_visitor::calculate_urb_setup()
1263 {
1264 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1265 c->prog_data.urb_setup[i] = -1;
1266 }
1267
1268 int urb_next = 0;
1269 /* Figure out where each of the incoming setup attributes lands. */
1270 if (brw->gen >= 6) {
1271 if (_mesa_bitcount_64(fp->Base.InputsRead &
1272 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1273 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1274 * first 16 varying inputs, so we can put them wherever we want.
1275 * Just put them in order.
1276 *
1277 * This is useful because it means that (a) inputs not used by the
1278 * fragment shader won't take up valuable register space, and (b) we
1279 * won't have to recompile the fragment shader if it gets paired with
1280 * a different vertex (or geometry) shader.
1281 */
1282 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1283 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1284 BITFIELD64_BIT(i)) {
1285 c->prog_data.urb_setup[i] = urb_next++;
1286 }
1287 }
1288 } else {
1289 /* We have enough input varyings that the SF/SBE pipeline stage can't
1290 * arbitrarily rearrange them to suit our whim; we have to put them
1291 * in an order that matches the output of the previous pipeline stage
1292 * (geometry or vertex shader).
1293 */
1294 struct brw_vue_map prev_stage_vue_map;
1295 brw_compute_vue_map(brw, &prev_stage_vue_map,
1296 c->key.input_slots_valid);
1297 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1298 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1299 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1300 slot++) {
1301 int varying = prev_stage_vue_map.slot_to_varying[slot];
1302 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1303 * unused.
1304 */
1305 if (varying != BRW_VARYING_SLOT_COUNT &&
1306 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1307 BITFIELD64_BIT(varying))) {
1308 c->prog_data.urb_setup[varying] = slot - first_slot;
1309 }
1310 }
1311 urb_next = prev_stage_vue_map.num_slots - first_slot;
1312 }
1313 } else {
1314 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1315 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1316 /* Point size is packed into the header, not as a general attribute */
1317 if (i == VARYING_SLOT_PSIZ)
1318 continue;
1319
1320 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1321 /* The back color slot is skipped when the front color is
1322 * also written to. In addition, some slots can be
1323 * written in the vertex shader and not read in the
1324 * fragment shader. So the register number must always be
1325 * incremented, mapped or not.
1326 */
1327 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1328 c->prog_data.urb_setup[i] = urb_next;
1329 urb_next++;
1330 }
1331 }
1332
1333 /*
1334 * It's an FS-only attribute, and we did interpolation for this attribute
1335 * in the SF thread, so count it here, too.
1336 *
1337 * See compile_sf_prog() for more info.
1338 */
1339 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1340 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1341 }
1342
1343 c->prog_data.num_varying_inputs = urb_next;
1344 }
1345
1346 void
1347 fs_visitor::assign_urb_setup()
1348 {
1349 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1350
1351 /* Offset all the urb_setup[] indices by the actual position of the
1352 * setup regs, now that the location of the constants has been chosen.
1353 */
1354 foreach_list(node, &this->instructions) {
1355 fs_inst *inst = (fs_inst *)node;
1356
1357 if (inst->opcode == FS_OPCODE_LINTERP) {
1358 assert(inst->src[2].file == HW_REG);
1359 inst->src[2].fixed_hw_reg.nr += urb_start;
1360 }
1361
1362 if (inst->opcode == FS_OPCODE_CINTERP) {
1363 assert(inst->src[0].file == HW_REG);
1364 inst->src[0].fixed_hw_reg.nr += urb_start;
1365 }
1366 }
1367
1368 /* Each attribute is 4 setup channels, each of which is half a reg. */
1369 this->first_non_payload_grf =
1370 urb_start + c->prog_data.num_varying_inputs * 2;
1371 }
1372
1373 /**
1374 * Split large virtual GRFs into separate components if we can.
1375 *
1376 * This is mostly duplicated with what brw_fs_vector_splitting does,
1377 * but that's really conservative because it's afraid of doing
1378 * splitting that doesn't result in real progress after the rest of
1379 * the optimization phases, which would cause infinite looping in
1380 * optimization. We can do it once here, safely. This also has the
1381 * opportunity to split interpolated values, or maybe even uniforms,
1382 * which we don't have at the IR level.
1383 *
1384 * We want to split, because virtual GRFs are what we register
1385 * allocate and spill (due to contiguousness requirements for some
1386 * instructions), and they're what we naturally generate in the
1387 * codegen process, but most virtual GRFs don't actually need to be
1388 * contiguous sets of GRFs. If we split, we'll end up with reduced
1389 * live intervals and better dead code elimination and coalescing.
1390 */
1391 void
1392 fs_visitor::split_virtual_grfs()
1393 {
1394 int num_vars = this->virtual_grf_count;
1395 bool split_grf[num_vars];
1396 int new_virtual_grf[num_vars];
1397
1398 /* Try to split anything larger than one register. */
1399 for (int i = 0; i < num_vars; i++) {
1400 if (this->virtual_grf_sizes[i] != 1)
1401 split_grf[i] = true;
1402 else
1403 split_grf[i] = false;
1404 }
1405
1406 if (brw->has_pln &&
1407 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1408 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1409 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1410 * Gen6, that was the only supported interpolation mode, and since Gen6,
1411 * delta_x and delta_y are in fixed hardware registers.
1412 */
1413 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1414 false;
1415 }
1416
1417 foreach_list(node, &this->instructions) {
1418 fs_inst *inst = (fs_inst *)node;
1419
1420 /* If there's a SEND message that requires contiguous destination
1421 * registers, no splitting is allowed.
1422 */
1423 if (inst->regs_written > 1) {
1424 split_grf[inst->dst.reg] = false;
1425 }
1426
1427 /* If we're sending from a GRF, don't split it, on the assumption that
1428 * the send is reading the whole thing.
1429 */
1430 if (inst->is_send_from_grf()) {
1431 for (int i = 0; i < 3; i++) {
1432 if (inst->src[i].file == GRF) {
1433 split_grf[inst->src[i].reg] = false;
1434 }
1435 }
1436 }
1437 }
1438
1439 /* Allocate new space for split regs. Note that the virtual
1440 * numbers will be contiguous.
1441 */
1442 for (int i = 0; i < num_vars; i++) {
1443 if (split_grf[i]) {
1444 new_virtual_grf[i] = virtual_grf_alloc(1);
1445 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1446 int reg = virtual_grf_alloc(1);
1447 assert(reg == new_virtual_grf[i] + j - 1);
1448 (void) reg;
1449 }
1450 this->virtual_grf_sizes[i] = 1;
1451 }
1452 }
1453
1454 foreach_list(node, &this->instructions) {
1455 fs_inst *inst = (fs_inst *)node;
1456
1457 if (inst->dst.file == GRF &&
1458 split_grf[inst->dst.reg] &&
1459 inst->dst.reg_offset != 0) {
1460 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1461 inst->dst.reg_offset - 1);
1462 inst->dst.reg_offset = 0;
1463 }
1464 for (int i = 0; i < 3; i++) {
1465 if (inst->src[i].file == GRF &&
1466 split_grf[inst->src[i].reg] &&
1467 inst->src[i].reg_offset != 0) {
1468 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1469 inst->src[i].reg_offset - 1);
1470 inst->src[i].reg_offset = 0;
1471 }
1472 }
1473 }
1474 invalidate_live_intervals();
1475 }
1476
1477 /**
1478 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1479 *
1480 * During code generation, we create tons of temporary variables, many of
1481 * which get immediately killed and are never used again. Yet, in later
1482 * optimization and analysis passes, such as compute_live_intervals, we need
1483 * to loop over all the virtual GRFs. Compacting them can save a lot of
1484 * overhead.
1485 */
1486 void
1487 fs_visitor::compact_virtual_grfs()
1488 {
1489 /* Mark which virtual GRFs are used, and count how many. */
1490 int remap_table[this->virtual_grf_count];
1491 memset(remap_table, -1, sizeof(remap_table));
1492
1493 foreach_list(node, &this->instructions) {
1494 const fs_inst *inst = (const fs_inst *) node;
1495
1496 if (inst->dst.file == GRF)
1497 remap_table[inst->dst.reg] = 0;
1498
1499 for (int i = 0; i < 3; i++) {
1500 if (inst->src[i].file == GRF)
1501 remap_table[inst->src[i].reg] = 0;
1502 }
1503 }
1504
1505 /* In addition to registers used in instructions, fs_visitor keeps
1506 * direct references to certain special values which must be patched:
1507 */
1508 fs_reg *special[] = {
1509 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1510 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1511 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1512 &delta_x[0], &delta_x[1], &delta_x[2],
1513 &delta_x[3], &delta_x[4], &delta_x[5],
1514 &delta_y[0], &delta_y[1], &delta_y[2],
1515 &delta_y[3], &delta_y[4], &delta_y[5],
1516 };
1517 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1518 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1519
1520 /* Treat all special values as used, to be conservative */
1521 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1522 if (special[i]->file == GRF)
1523 remap_table[special[i]->reg] = 0;
1524 }
1525
1526 /* Compact the GRF arrays. */
1527 int new_index = 0;
1528 for (int i = 0; i < this->virtual_grf_count; i++) {
1529 if (remap_table[i] != -1) {
1530 remap_table[i] = new_index;
1531 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1532 invalidate_live_intervals();
1533 ++new_index;
1534 }
1535 }
1536
1537 this->virtual_grf_count = new_index;
1538
1539 /* Patch all the instructions to use the newly renumbered registers */
1540 foreach_list(node, &this->instructions) {
1541 fs_inst *inst = (fs_inst *) node;
1542
1543 if (inst->dst.file == GRF)
1544 inst->dst.reg = remap_table[inst->dst.reg];
1545
1546 for (int i = 0; i < 3; i++) {
1547 if (inst->src[i].file == GRF)
1548 inst->src[i].reg = remap_table[inst->src[i].reg];
1549 }
1550 }
1551
1552 /* Patch all the references to special values */
1553 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1554 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1555 special[i]->reg = remap_table[special[i]->reg];
1556 }
1557 }
1558
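/** Drops uniform params that no instruction references and renumbers the
 * survivors. The remap table is built during the 8-wide compile and reused
 * by the 16-wide compile.
 */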
1559 bool
1560 fs_visitor::remove_dead_constants()
1561 {
1562 if (dispatch_width == 8) {
1563 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1564 this->nr_params_remap = c->prog_data.nr_params;
1565
1566 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1567 this->params_remap[i] = -1;
1568
1569 /* Find which params are still in use. */
1570 foreach_list(node, &this->instructions) {
1571 fs_inst *inst = (fs_inst *)node;
1572
1573 for (int i = 0; i < 3; i++) {
1574 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1575
1576 if (inst->src[i].file != UNIFORM)
1577 continue;
1578
1579 /* Section 5.11 of the OpenGL 4.3 spec says:
1580 *
1581 * "Out-of-bounds reads return undefined values, which include
1582 * values from other variables of the active program or zero."
1583 */
1584 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1585 constant_nr = 0;
1586 }
1587
1588 /* For now, set this to non-negative. We'll give it the
1589 * actual new number in a moment, in order to keep the
1590 * register numbers nicely ordered.
1591 */
1592 this->params_remap[constant_nr] = 0;
1593 }
1594 }
1595
1596 /* Figure out what the new numbers for the params will be. At some
1597 * point when we're doing uniform array access, we're going to want
1598 * to keep the distinction between .reg and .reg_offset, but for
1599 * now we don't care.
1600 */
1601 unsigned int new_nr_params = 0;
1602 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1603 if (this->params_remap[i] != -1) {
1604 this->params_remap[i] = new_nr_params++;
1605 }
1606 }
1607
1608 /* Update the list of params to be uploaded to match our new numbering. */
1609 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1610 int remapped = this->params_remap[i];
1611
1612 if (remapped == -1)
1613 continue;
1614
1615 c->prog_data.param[remapped] = c->prog_data.param[i];
1616 }
1617
1618 c->prog_data.nr_params = new_nr_params;
1619 } else {
1620 /* This should have been generated in the 8-wide pass already. */
1621 assert(this->params_remap);
1622 }
1623
1624 /* Now do the renumbering of the shader to remove unused params. */
1625 foreach_list(node, &this->instructions) {
1626 fs_inst *inst = (fs_inst *)node;
1627
1628 for (int i = 0; i < 3; i++) {
1629 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1630
1631 if (inst->src[i].file != UNIFORM)
1632 continue;
1633
1634 /* As above, alias out-of-bounds accesses to constant 0. */
1635 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1636 constant_nr = 0;
1637 }
1638 assert(this->params_remap[constant_nr] != -1);
1639 inst->src[i].reg = this->params_remap[constant_nr];
1640 inst->src[i].reg_offset = 0;
1641 }
1642 }
1643
1644 return true;
1645 }
1646
1647 /*
1648 * Implements array access of uniforms by inserting a
1649 * PULL_CONSTANT_LOAD instruction.
1650 *
1651 * Unlike temporary GRF array access (where we don't support it due to
1652 * the difficulty of doing relative addressing on instruction
1653 * destinations), we could potentially do array access of uniforms
1654 * that were loaded in GRF space as push constants. In real-world
1655 * usage we've seen, though, the arrays being used are always larger
1656 * than we could load as push constants, so just always move all
1657 * uniform array access out to a pull constant buffer.
1658 */
1659 void
1660 fs_visitor::move_uniform_array_access_to_pull_constants()
1661 {
1662 int pull_constant_loc[c->prog_data.nr_params];
1663
1664 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1665 pull_constant_loc[i] = -1;
1666 }
1667
1668 /* Walk through and find array access of uniforms. Put a copy of that
1669 * uniform in the pull constant buffer.
1670 *
1671 * Note that we don't move constant-indexed accesses to arrays. No
1672 * testing has been done of the performance impact of this choice.
1673 */
1674 foreach_list_safe(node, &this->instructions) {
1675 fs_inst *inst = (fs_inst *)node;
1676
1677 for (int i = 0 ; i < 3; i++) {
1678 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1679 continue;
1680
1681 int uniform = inst->src[i].reg;
1682
1683 /* If this array isn't already present in the pull constant buffer,
1684 * add it.
1685 */
1686 if (pull_constant_loc[uniform] == -1) {
1687 const float **values = &c->prog_data.param[uniform];
1688
1689 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1690
1691 assert(param_size[uniform]);
1692
1693 for (int j = 0; j < param_size[uniform]; j++) {
1694 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1695 values[j];
1696 }
1697 }
1698
1699 /* Set up the annotation tracking for new generated instructions. */
1700 base_ir = inst->ir;
1701 current_annotation = inst->annotation;
1702
1703 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1704 fs_reg temp = fs_reg(this, glsl_type::float_type);
1705 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1706 surf_index,
1707 *inst->src[i].reladdr,
1708 pull_constant_loc[uniform] +
1709 inst->src[i].reg_offset);
1710 inst->insert_before(&list);
1711
1712 inst->src[i].file = temp.file;
1713 inst->src[i].reg = temp.reg;
1714 inst->src[i].reg_offset = temp.reg_offset;
1715 inst->src[i].reladdr = NULL;
1716 }
1717 }
1718 }
1719
1720 /**
1721 * Choose accesses from the UNIFORM file to demote to using the pull
1722 * constant buffer.
1723 *
1724 * We allow a fragment shader to have more than the specified minimum
1725 * maximum number of fragment shader uniform components (64). If
1726 * there are too many of these, they'd fill up all of register space.
1727 * So, this will push some of them out to the pull constant buffer and
1728 * update the program to load them.
1729 */
1730 void
1731 fs_visitor::setup_pull_constants()
1732 {
1733 /* Only allow 16 registers (128 uniform components) as push constants. */
1734 unsigned int max_uniform_components = 16 * 8;
1735 if (c->prog_data.nr_params <= max_uniform_components)
1736 return;
1737
1738 if (dispatch_width == 16) {
1739 fail("Pull constants not supported in 16-wide\n");
1740 return;
1741 }
1742
1743 /* Just demote the end of the list. We could probably do better
1744 * here, demoting things that are rarely used in the program first.
1745 */
1746 unsigned int pull_uniform_base = max_uniform_components;
1747
1748 int pull_constant_loc[c->prog_data.nr_params];
1749 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1750 if (i < pull_uniform_base) {
1751 pull_constant_loc[i] = -1;
1752 } else {
1753 pull_constant_loc[i] = -1;
1754 /* If our constant is already being uploaded for reladdr purposes,
1755 * reuse it.
1756 */
1757 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1758 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1759 pull_constant_loc[i] = j;
1760 break;
1761 }
1762 }
1763 if (pull_constant_loc[i] == -1) {
1764 int pull_index = c->prog_data.nr_pull_params++;
1765 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1766 pull_constant_loc[i] = pull_index;
1767 }
1768 }
1769 }
1770 c->prog_data.nr_params = pull_uniform_base;
1771
1772 foreach_list(node, &this->instructions) {
1773 fs_inst *inst = (fs_inst *)node;
1774
1775 for (int i = 0; i < 3; i++) {
1776 if (inst->src[i].file != UNIFORM)
1777 continue;
1778
1779 int pull_index = pull_constant_loc[inst->src[i].reg +
1780 inst->src[i].reg_offset];
1781 if (pull_index == -1)
1782 continue;
1783
1784 assert(!inst->src[i].reladdr);
1785
1786 fs_reg dst = fs_reg(this, glsl_type::float_type);
1787 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1788 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1789 fs_inst *pull =
1790 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1791 dst, index, offset);
1792 pull->ir = inst->ir;
1793 pull->annotation = inst->annotation;
1794
1795 inst->insert_before(pull);
1796
1797 inst->src[i].file = GRF;
1798 inst->src[i].reg = dst.reg;
1799 inst->src[i].reg_offset = 0;
1800 inst->src[i].smear = pull_index & 3;
1801 }
1802 }
1803 }
1804
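/** Applies simple algebraic simplifications (a * 1.0 -> a, a * 0.0 -> 0.0,
 * a + 0.0 -> a), rewriting the matching instructions as MOVs.
 */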
1805 bool
1806 fs_visitor::opt_algebraic()
1807 {
1808 bool progress = false;
1809
1810 foreach_list(node, &this->instructions) {
1811 fs_inst *inst = (fs_inst *)node;
1812
1813 switch (inst->opcode) {
1814 case BRW_OPCODE_MUL:
1815 if (inst->src[1].file != IMM)
1816 continue;
1817
1818 /* a * 1.0 = a */
1819 if (inst->src[1].is_one()) {
1820 inst->opcode = BRW_OPCODE_MOV;
1821 inst->src[1] = reg_undef;
1822 progress = true;
1823 break;
1824 }
1825
1826 /* a * 0.0 = 0.0 */
1827 if (inst->src[1].is_zero()) {
1828 inst->opcode = BRW_OPCODE_MOV;
1829 inst->src[0] = inst->src[1];
1830 inst->src[1] = reg_undef;
1831 progress = true;
1832 break;
1833 }
1834
1835 break;
1836 case BRW_OPCODE_ADD:
1837 if (inst->src[1].file != IMM)
1838 continue;
1839
1840 /* a + 0.0 = a */
1841 if (inst->src[1].is_zero()) {
1842 inst->opcode = BRW_OPCODE_MOV;
1843 inst->src[1] = reg_undef;
1844 progress = true;
1845 break;
1846 }
1847 break;
1848 default:
1849 break;
1850 }
1851 }
1852
1853 return progress;
1854 }
1855
1856 /**
1857 * Removes any instructions writing a VGRF where that VGRF is not used by any
1858 * later instruction.
1859 */
1860 bool
1861 fs_visitor::dead_code_eliminate()
1862 {
1863 bool progress = false;
1864 int pc = 0;
1865
1866 calculate_live_intervals();
1867
1868 foreach_list_safe(node, &this->instructions) {
1869 fs_inst *inst = (fs_inst *)node;
1870
1871 if (inst->dst.file == GRF) {
1872 bool dead = true;
1873
1874 for (int i = 0; i < inst->regs_written; i++) {
1875 int var = live_intervals->var_from_vgrf[inst->dst.reg];
1876 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
1877 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
1878 dead = false;
1879 break;
1880 }
1881 }
1882
1883 if (dead) {
1884 /* Don't dead code eliminate instructions that write to the
1885 * accumulator as a side-effect. Instead just set the destination
1886 * to the null register to free it.
1887 */
1888 switch (inst->opcode) {
1889 case BRW_OPCODE_ADDC:
1890 case BRW_OPCODE_SUBB:
1891 case BRW_OPCODE_MACH:
1892 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
1893 break;
1894 default:
1895 inst->remove();
1896 progress = true;
1897 break;
1898 }
1899 }
1900 }
1901
1902 pc++;
1903 }
1904
1905 if (progress)
1906 invalidate_live_intervals();
1907
1908 return progress;
1909 }
1910
1911 struct dead_code_hash_key
1912 {
1913 int vgrf;
1914 int reg_offset;
1915 };
1916
1917 static bool
1918 dead_code_hash_compare(const void *a, const void *b)
1919 {
1920 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1921 }
1922
1923 static void
1924 clear_dead_code_hash(struct hash_table *ht)
1925 {
1926 struct hash_entry *entry;
1927
1928 hash_table_foreach(ht, entry) {
1929 _mesa_hash_table_remove(ht, entry);
1930 }
1931 }
1932
1933 static void
1934 insert_dead_code_hash(struct hash_table *ht,
1935 int vgrf, int reg_offset, fs_inst *inst)
1936 {
1937 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1938 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1939
1940 key->vgrf = vgrf;
1941 key->reg_offset = reg_offset;
1942
1943 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1944 }
1945
1946 static struct hash_entry *
1947 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1948 {
1949 struct dead_code_hash_key key;
1950
1951 key.vgrf = vgrf;
1952 key.reg_offset = reg_offset;
1953
1954 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1955 }
1956
1957 static void
1958 remove_dead_code_hash(struct hash_table *ht,
1959 int vgrf, int reg_offset)
1960 {
1961 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1962 if (!entry)
1963 return;
1964
1965 _mesa_hash_table_remove(ht, entry);
1966 }
1967
1968 /**
1969 * Walks basic blocks, removing any regs that are written but not read before
1970 * being redefined.
1971 *
1972 * The dead_code_eliminate() function implements a global dead code
1973  * elimination, but it only handles removing the last write to a register
1974 * if it's never read. This one can handle intermediate writes, but only
1975 * within a basic block.
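 *
 * For example (hypothetical IR within one block):
 *
 *    mov vgrf3, vgrf1    <- removed: overwritten below before being read
 *    mov vgrf3, vgrf2
 *    add vgrf4, vgrf3, vgrf2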
1976 */
1977 bool
1978 fs_visitor::dead_code_eliminate_local()
1979 {
1980 struct hash_table *ht;
1981 bool progress = false;
1982
1983 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1984
1985 foreach_list_safe(node, &this->instructions) {
1986 fs_inst *inst = (fs_inst *)node;
1987
1988       /* At a basic block boundary (control flow), empty the HT since we
1989        * don't track dataflow across blocks.
1990 */
1991 if (inst->is_control_flow()) {
1992 clear_dead_code_hash(ht);
1993 continue;
1994 }
1995
1996 /* Clear the HT of any instructions that got read. */
1997 for (int i = 0; i < 3; i++) {
1998 fs_reg src = inst->src[i];
1999 if (src.file != GRF)
2000 continue;
2001
2002 int read = 1;
2003 if (inst->is_send_from_grf())
2004 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2005
2006 for (int reg_offset = src.reg_offset;
2007 reg_offset < src.reg_offset + read;
2008 reg_offset++) {
2009 remove_dead_code_hash(ht, src.reg, reg_offset);
2010 }
2011 }
2012
2013 /* Add any update of a GRF to the HT, removing a previous write if it
2014 * wasn't read.
2015 */
2016 if (inst->dst.file == GRF) {
2017 if (inst->regs_written > 1) {
2018 /* We don't know how to trim channels from an instruction's
2019 * writes, so we can't incrementally remove unread channels from
2020           * it.  Just remove whatever it overwrites from the table.
2021 */
2022 for (int i = 0; i < inst->regs_written; i++) {
2023 remove_dead_code_hash(ht,
2024 inst->dst.reg,
2025 inst->dst.reg_offset + i);
2026 }
2027 } else {
2028 struct hash_entry *entry =
2029 get_dead_code_hash_entry(ht, inst->dst.reg,
2030 inst->dst.reg_offset);
2031
2032 if (inst->is_partial_write()) {
2033 /* For a partial write, we can't remove any previous dead code
2034              * candidate, since we're just modifying its result, but we can
2035              * be dead code eliminated ourselves.
2036 */
2037 if (entry) {
2038 entry->data = inst;
2039 } else {
2040 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2041 inst);
2042 }
2043 } else {
2044 if (entry) {
2045 /* We're completely updating a channel, and there was a
2046 * previous write to the channel that wasn't read. Kill it!
2047 */
2048 fs_inst *inst = (fs_inst *)entry->data;
2049 inst->remove();
2050 progress = true;
2051 _mesa_hash_table_remove(ht, entry);
2052 }
2053
2054 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2055 inst);
2056 }
2057 }
2058 }
2059 }
2060
2061 _mesa_hash_table_destroy(ht, NULL);
2062
2063 if (progress)
2064 invalidate_live_intervals();
2065
2066 return progress;
2067 }
2068
2069 /**
2070 * Implements a second type of register coalescing: This one checks if
2071 * the two regs involved in a raw move don't interfere, in which case
2072  * they can both be stored in the same place and the MOV removed.
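 *
 * For example (hypothetical IR), given "mov vgrf5, vgrf2" where the live
 * ranges of vgrf2 and vgrf5 don't interfere, every other reference to vgrf2
 * is rewritten to vgrf5 and the MOV is removed.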
2073 */
2074 bool
2075 fs_visitor::register_coalesce_2()
2076 {
2077 bool progress = false;
2078
2079 calculate_live_intervals();
2080
2081 foreach_list_safe(node, &this->instructions) {
2082 fs_inst *inst = (fs_inst *)node;
2083
2084 if (inst->opcode != BRW_OPCODE_MOV ||
2085 inst->is_partial_write() ||
2086 inst->saturate ||
2087 inst->src[0].file != GRF ||
2088 inst->src[0].negate ||
2089 inst->src[0].abs ||
2090 inst->src[0].smear != -1 ||
2091 inst->dst.file != GRF ||
2092 inst->dst.type != inst->src[0].type ||
2093 virtual_grf_sizes[inst->src[0].reg] != 1) {
2094 continue;
2095 }
2096
2097 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2098 int var_to = live_intervals->var_from_reg(&inst->dst);
2099
2100 if (live_intervals->vars_interfere(var_from, var_to))
2101 continue;
2102
2103 int reg_from = inst->src[0].reg;
2104 assert(inst->src[0].reg_offset == 0);
2105 int reg_to = inst->dst.reg;
2106 int reg_to_offset = inst->dst.reg_offset;
2107
2108 foreach_list(node, &this->instructions) {
2109 fs_inst *scan_inst = (fs_inst *)node;
2110
2111 if (scan_inst->dst.file == GRF &&
2112 scan_inst->dst.reg == reg_from) {
2113 scan_inst->dst.reg = reg_to;
2114 scan_inst->dst.reg_offset = reg_to_offset;
2115 }
2116 for (int i = 0; i < 3; i++) {
2117 if (scan_inst->src[i].file == GRF &&
2118 scan_inst->src[i].reg == reg_from) {
2119 scan_inst->src[i].reg = reg_to;
2120 scan_inst->src[i].reg_offset = reg_to_offset;
2121 }
2122 }
2123 }
2124
2125 inst->remove();
2126 progress = true;
2127 continue;
2128 }
2129
2130 if (progress)
2131 invalidate_live_intervals();
2132
2133 return progress;
2134 }
2135
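/**
 * Coalesces away raw MOVs from a GRF or uniform: when neither the MOV's
 * source nor its destination is written again afterwards, later reads of the
 * destination are rewritten to read the source instead and the MOV is
 * removed (a rough summary of the checks in the pass below).
 */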
2136 bool
2137 fs_visitor::register_coalesce()
2138 {
2139 bool progress = false;
2140 int if_depth = 0;
2141 int loop_depth = 0;
2142
2143 foreach_list_safe(node, &this->instructions) {
2144 fs_inst *inst = (fs_inst *)node;
2145
2146       /* Make sure that we dominate the instructions we're going to scan
2147        * for interference with our coalescing; otherwise we won't have
2148        * scanned enough to know whether anything interferes.  We don't
2149        * dominate the following instructions if we're inside a loop or an
2150        * if block.
2151 */
2152 switch (inst->opcode) {
2153 case BRW_OPCODE_DO:
2154 loop_depth++;
2155 break;
2156 case BRW_OPCODE_WHILE:
2157 loop_depth--;
2158 break;
2159 case BRW_OPCODE_IF:
2160 if_depth++;
2161 break;
2162 case BRW_OPCODE_ENDIF:
2163 if_depth--;
2164 break;
2165 default:
2166 break;
2167 }
2168 if (loop_depth || if_depth)
2169 continue;
2170
2171 if (inst->opcode != BRW_OPCODE_MOV ||
2172 inst->is_partial_write() ||
2173 inst->saturate ||
2174 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2175                                       inst->src[0].file != UNIFORM) ||
2176 inst->dst.type != inst->src[0].type)
2177 continue;
2178
2179 bool has_source_modifiers = (inst->src[0].abs ||
2180 inst->src[0].negate ||
2181 inst->src[0].smear != -1 ||
2182 inst->src[0].file == UNIFORM);
2183
2184 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2185 * them: check for no writes to either one until the exit of the
2186 * program.
2187 */
2188 bool interfered = false;
2189
2190 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2191 !scan_inst->is_tail_sentinel();
2192 scan_inst = (fs_inst *)scan_inst->next) {
2193 if (scan_inst->dst.file == GRF) {
2194 if (scan_inst->overwrites_reg(inst->dst) ||
2195 scan_inst->overwrites_reg(inst->src[0])) {
2196 interfered = true;
2197 break;
2198 }
2199 }
2200
2201 if (has_source_modifiers) {
2202 for (int i = 0; i < 3; i++) {
2203 if (scan_inst->src[i].file == GRF &&
2204 scan_inst->src[i].reg == inst->dst.reg &&
2205 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2206 inst->dst.type != scan_inst->src[i].type)
2207 {
2208 interfered = true;
2209 break;
2210 }
2211 }
2212 }
2213
2214
2215 /* The gen6 MATH instruction can't handle source modifiers or
2216 * unusual register regions, so avoid coalescing those for
2217 * now. We should do something more specific.
2218 */
2219 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2220 interfered = true;
2221 break;
2222 }
2223
2224 if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2225 scan_inst->src[0].file == GRF &&
2226 scan_inst->src[0].reg == inst->dst.reg) {
2227 interfered = true;
2228 break;
2229 }
2230
2231 /* The accumulator result appears to get used for the
2232 * conditional modifier generation. When negating a UD
2233 * value, there is a 33rd bit generated for the sign in the
2234 * accumulator value, so now you can't check, for example,
2235 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2236 */
2237 if (scan_inst->conditional_mod &&
2238 inst->src[0].negate &&
2239 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2240 interfered = true;
2241 break;
2242 }
2243 }
2244 if (interfered) {
2245 continue;
2246 }
2247
2248 /* Rewrite the later usage to point at the source of the move to
2249 * be removed.
2250 */
2251 for (fs_inst *scan_inst = inst;
2252 !scan_inst->is_tail_sentinel();
2253 scan_inst = (fs_inst *)scan_inst->next) {
2254 for (int i = 0; i < 3; i++) {
2255 if (scan_inst->src[i].file == GRF &&
2256 scan_inst->src[i].reg == inst->dst.reg &&
2257 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2258 fs_reg new_src = inst->src[0];
2259 if (scan_inst->src[i].abs) {
2260 new_src.negate = 0;
2261 new_src.abs = 1;
2262 }
2263 new_src.negate ^= scan_inst->src[i].negate;
2264 new_src.sechalf = scan_inst->src[i].sechalf;
2265 scan_inst->src[i] = new_src;
2266 }
2267 }
2268 }
2269
2270 inst->remove();
2271 progress = true;
2272 }
2273
2274 if (progress)
2275 invalidate_live_intervals();
2276
2277 return progress;
2278 }
2279
2280
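/**
 * Looks for a MOV of a GRF into an MRF and tries to make the instruction
 * that computed that GRF write into the MRF directly, removing the MOV.
 * A rough, hypothetical example:
 *
 *    add vgrf7, vgrf2, vgrf3
 *    mov m4, vgrf7           <- becomes "add m4, vgrf2, vgrf3"
 */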
2281 bool
2282 fs_visitor::compute_to_mrf()
2283 {
2284 bool progress = false;
2285 int next_ip = 0;
2286
2287 calculate_live_intervals();
2288
2289 foreach_list_safe(node, &this->instructions) {
2290 fs_inst *inst = (fs_inst *)node;
2291
2292 int ip = next_ip;
2293 next_ip++;
2294
2295 if (inst->opcode != BRW_OPCODE_MOV ||
2296 inst->is_partial_write() ||
2297 inst->dst.file != MRF || inst->src[0].file != GRF ||
2298 inst->dst.type != inst->src[0].type ||
2299 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2300 continue;
2301
2302 /* Work out which hardware MRF registers are written by this
2303 * instruction.
2304 */
2305 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2306 int mrf_high;
2307 if (inst->dst.reg & BRW_MRF_COMPR4) {
2308 mrf_high = mrf_low + 4;
2309 } else if (dispatch_width == 16 &&
2310 (!inst->force_uncompressed && !inst->force_sechalf)) {
2311 mrf_high = mrf_low + 1;
2312 } else {
2313 mrf_high = mrf_low;
2314 }
2315
2316 /* Can't compute-to-MRF this GRF if someone else was going to
2317 * read it later.
2318 */
2319 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2320 continue;
2321
2322       /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
2323        * the instruction that generated this GRF to write into the MRF directly.
2324 */
2325 fs_inst *scan_inst;
2326 for (scan_inst = (fs_inst *)inst->prev;
2327 scan_inst->prev != NULL;
2328 scan_inst = (fs_inst *)scan_inst->prev) {
2329 if (scan_inst->dst.file == GRF &&
2330 scan_inst->dst.reg == inst->src[0].reg) {
2331          /* Found the last instruction to write the reg we want to turn
2332           * into a compute-to-MRF.
2333 */
2334
2335 /* If this one instruction didn't populate all the
2336 * channels, bail. We might be able to rewrite everything
2337 * that writes that reg, but it would require smarter
2338 * tracking to delay the rewriting until complete success.
2339 */
2340 if (scan_inst->is_partial_write())
2341 break;
2342
2343 /* Things returning more than one register would need us to
2344 * understand coalescing out more than one MOV at a time.
2345 */
2346 if (scan_inst->regs_written > 1)
2347 break;
2348
2349 /* SEND instructions can't have MRF as a destination. */
2350 if (scan_inst->mlen)
2351 break;
2352
2353 if (brw->gen == 6) {
2354 /* gen6 math instructions must have the destination be
2355 * GRF, so no compute-to-MRF for them.
2356 */
2357 if (scan_inst->is_math()) {
2358 break;
2359 }
2360 }
2361
2362 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2363 /* Found the creator of our MRF's source value. */
2364 scan_inst->dst.file = MRF;
2365 scan_inst->dst.reg = inst->dst.reg;
2366 scan_inst->saturate |= inst->saturate;
2367 inst->remove();
2368 progress = true;
2369 }
2370 break;
2371 }
2372
2373 /* We don't handle control flow here. Most computation of
2374        * values that end up in MRFs happens shortly before the MRF
2375 * write anyway.
2376 */
2377 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2378 break;
2379
2380 /* You can't read from an MRF, so if someone else reads our
2381 * MRF's source GRF that we wanted to rewrite, that stops us.
2382 */
2383 bool interfered = false;
2384 for (int i = 0; i < 3; i++) {
2385 if (scan_inst->src[i].file == GRF &&
2386 scan_inst->src[i].reg == inst->src[0].reg &&
2387 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2388 interfered = true;
2389 }
2390 }
2391 if (interfered)
2392 break;
2393
2394 if (scan_inst->dst.file == MRF) {
2395 /* If somebody else writes our MRF here, we can't
2396 * compute-to-MRF before that.
2397 */
2398 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2399 int scan_mrf_high;
2400
2401 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2402 scan_mrf_high = scan_mrf_low + 4;
2403 } else if (dispatch_width == 16 &&
2404 (!scan_inst->force_uncompressed &&
2405 !scan_inst->force_sechalf)) {
2406 scan_mrf_high = scan_mrf_low + 1;
2407 } else {
2408 scan_mrf_high = scan_mrf_low;
2409 }
2410
2411 if (mrf_low == scan_mrf_low ||
2412 mrf_low == scan_mrf_high ||
2413 mrf_high == scan_mrf_low ||
2414 mrf_high == scan_mrf_high) {
2415 break;
2416 }
2417 }
2418
2419 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2420 /* Found a SEND instruction, which means that there are
2421 * live values in MRFs from base_mrf to base_mrf +
2422 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2423 * above it.
2424 */
2425 if (mrf_low >= scan_inst->base_mrf &&
2426 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2427 break;
2428 }
2429 if (mrf_high >= scan_inst->base_mrf &&
2430 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2431 break;
2432 }
2433 }
2434 }
2435 }
2436
2437 if (progress)
2438 invalidate_live_intervals();
2439
2440 return progress;
2441 }
2442
2443 /**
2444 * Walks through basic blocks, looking for repeated MRF writes and
2445 * removing the later ones.
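 *
 * For example (hypothetical), if "mov m3, vgrf2" appears twice in a block
 * with no intervening write to m3 or vgrf2, the second copy is removed.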
2446 */
2447 bool
2448 fs_visitor::remove_duplicate_mrf_writes()
2449 {
2450 fs_inst *last_mrf_move[16];
2451 bool progress = false;
2452
2453 /* Need to update the MRF tracking for compressed instructions. */
2454 if (dispatch_width == 16)
2455 return false;
2456
2457 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2458
2459 foreach_list_safe(node, &this->instructions) {
2460 fs_inst *inst = (fs_inst *)node;
2461
2462 if (inst->is_control_flow()) {
2463 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2464 }
2465
2466 if (inst->opcode == BRW_OPCODE_MOV &&
2467 inst->dst.file == MRF) {
2468 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2469 if (prev_inst && inst->equals(prev_inst)) {
2470 inst->remove();
2471 progress = true;
2472 continue;
2473 }
2474 }
2475
2476 /* Clear out the last-write records for MRFs that were overwritten. */
2477 if (inst->dst.file == MRF) {
2478 last_mrf_move[inst->dst.reg] = NULL;
2479 }
2480
2481 if (inst->mlen > 0 && inst->base_mrf != -1) {
2482 /* Found a SEND instruction, which will include two or fewer
2483 * implied MRF writes. We could do better here.
2484 */
2485 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2486 last_mrf_move[inst->base_mrf + i] = NULL;
2487 }
2488 }
2489
2490 /* Clear out any MRF move records whose sources got overwritten. */
2491 if (inst->dst.file == GRF) {
2492 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2493 if (last_mrf_move[i] &&
2494 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2495 last_mrf_move[i] = NULL;
2496 }
2497 }
2498 }
2499
2500 if (inst->opcode == BRW_OPCODE_MOV &&
2501 inst->dst.file == MRF &&
2502 inst->src[0].file == GRF &&
2503 !inst->is_partial_write()) {
2504 last_mrf_move[inst->dst.reg] = inst;
2505 }
2506 }
2507
2508 if (progress)
2509 invalidate_live_intervals();
2510
2511 return progress;
2512 }
2513
2514 static void
2515 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2516 int first_grf, int grf_len)
2517 {
2518 bool inst_16wide = (dispatch_width > 8 &&
2519 !inst->force_uncompressed &&
2520 !inst->force_sechalf);
2521
2522 /* Clear the flag for registers that actually got read (as expected). */
2523 for (int i = 0; i < 3; i++) {
2524 int grf;
2525 if (inst->src[i].file == GRF) {
2526 grf = inst->src[i].reg;
2527 } else if (inst->src[i].file == HW_REG &&
2528 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2529 grf = inst->src[i].fixed_hw_reg.nr;
2530 } else {
2531 continue;
2532 }
2533
2534 if (grf >= first_grf &&
2535 grf < first_grf + grf_len) {
2536 deps[grf - first_grf] = false;
2537 if (inst_16wide)
2538 deps[grf - first_grf + 1] = false;
2539 }
2540 }
2541 }
2542
2543 /**
2544 * Implements this workaround for the original 965:
2545 *
2546 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2547 * check for post destination dependencies on this instruction, software
2548 * must ensure that there is no destination hazard for the case of ‘write
2549 * followed by a posted write’ shown in the following example.
2550 *
2551 * 1. mov r3 0
2552 * 2. send r3.xy <rest of send instruction>
2553 * 3. mov r2 r3
2554 *
2555 * Due to no post-destination dependency check on the ‘send’, the above
2556 * code sequence could have two instructions (1 and 2) in flight at the
2557 * same time that both consider ‘r3’ as the target of their final writes.
2558 */
2559 void
2560 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2561 {
2562 int reg_size = dispatch_width / 8;
2563 int write_len = inst->regs_written * reg_size;
2564 int first_write_grf = inst->dst.reg;
2565 bool needs_dep[BRW_MAX_MRF];
2566 assert(write_len < (int)sizeof(needs_dep) - 1);
2567
2568 memset(needs_dep, false, sizeof(needs_dep));
2569 memset(needs_dep, true, write_len);
2570
2571 clear_deps_for_inst_src(inst, dispatch_width,
2572 needs_dep, first_write_grf, write_len);
2573
2574 /* Walk backwards looking for writes to registers we're writing which
2575 * aren't read since being written. If we hit the start of the program,
2576 * we assume that there are no outstanding dependencies on entry to the
2577 * program.
2578 */
2579 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2580 scan_inst != NULL;
2581 scan_inst = (fs_inst *)scan_inst->prev) {
2582
2583 /* If we hit control flow, assume that there *are* outstanding
2584 * dependencies, and force their cleanup before our instruction.
2585 */
2586 if (scan_inst->is_control_flow()) {
2587 for (int i = 0; i < write_len; i++) {
2588 if (needs_dep[i]) {
2589 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2590 }
2591 }
2592 return;
2593 }
2594
2595 bool scan_inst_16wide = (dispatch_width > 8 &&
2596 !scan_inst->force_uncompressed &&
2597 !scan_inst->force_sechalf);
2598
2599       /* We insert our dependency-resolving reads as late as possible, on the
2600        * assumption that any non-MOV instruction that might have left us an
2601        * outstanding dependency has more latency than a MOV.
2602 */
2603 if (scan_inst->dst.file == GRF) {
2604 for (int i = 0; i < scan_inst->regs_written; i++) {
2605 int reg = scan_inst->dst.reg + i * reg_size;
2606
2607 if (reg >= first_write_grf &&
2608 reg < first_write_grf + write_len &&
2609 needs_dep[reg - first_write_grf]) {
2610 inst->insert_before(DEP_RESOLVE_MOV(reg));
2611 needs_dep[reg - first_write_grf] = false;
2612 if (scan_inst_16wide)
2613 needs_dep[reg - first_write_grf + 1] = false;
2614 }
2615 }
2616 }
2617
2618 /* Clear the flag for registers that actually got read (as expected). */
2619 clear_deps_for_inst_src(scan_inst, dispatch_width,
2620 needs_dep, first_write_grf, write_len);
2621
2622 /* Continue the loop only if we haven't resolved all the dependencies */
2623 int i;
2624 for (i = 0; i < write_len; i++) {
2625 if (needs_dep[i])
2626 break;
2627 }
2628 if (i == write_len)
2629 return;
2630 }
2631 }
2632
2633 /**
2634 * Implements this workaround for the original 965:
2635 *
2636 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2637 * used as a destination register until after it has been sourced by an
2638 * instruction with a different destination register.
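 *
 * Roughly, given
 *
 *    1. send r3 <rest of send instruction>
 *    2. mov r3 <something>
 *
 * we insert a MOV that sources r3 before instruction 2 (unless some other
 * instruction read r3 first), so r3 isn't reused as a destination while the
 * send's write is still outstanding.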
2639 */
2640 void
2641 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2642 {
2643 int write_len = inst->regs_written * dispatch_width / 8;
2644 int first_write_grf = inst->dst.reg;
2645 bool needs_dep[BRW_MAX_MRF];
2646 assert(write_len < (int)sizeof(needs_dep) - 1);
2647
2648 memset(needs_dep, false, sizeof(needs_dep));
2649 memset(needs_dep, true, write_len);
2650 /* Walk forwards looking for writes to registers we're writing which aren't
2651 * read before being written.
2652 */
2653 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2654 !scan_inst->is_tail_sentinel();
2655 scan_inst = (fs_inst *)scan_inst->next) {
2656 /* If we hit control flow, force resolve all remaining dependencies. */
2657 if (scan_inst->is_control_flow()) {
2658 for (int i = 0; i < write_len; i++) {
2659 if (needs_dep[i])
2660 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2661 }
2662 return;
2663 }
2664
2665 /* Clear the flag for registers that actually got read (as expected). */
2666 clear_deps_for_inst_src(scan_inst, dispatch_width,
2667 needs_dep, first_write_grf, write_len);
2668
2669 /* We insert our reads as late as possible since they're reading the
2670 * result of a SEND, which has massive latency.
2671 */
2672 if (scan_inst->dst.file == GRF &&
2673 scan_inst->dst.reg >= first_write_grf &&
2674 scan_inst->dst.reg < first_write_grf + write_len &&
2675 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2676 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2677 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2678 }
2679
2680 /* Continue the loop only if we haven't resolved all the dependencies */
2681 int i;
2682 for (i = 0; i < write_len; i++) {
2683 if (needs_dep[i])
2684 break;
2685 }
2686 if (i == write_len)
2687 return;
2688 }
2689
2690 /* If we hit the end of the program, resolve all remaining dependencies out
2691 * of paranoia.
2692 */
2693 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2694 assert(last_inst->eot);
2695 for (int i = 0; i < write_len; i++) {
2696 if (needs_dep[i])
2697 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2698 }
2699 }
2700
2701 void
2702 fs_visitor::insert_gen4_send_dependency_workarounds()
2703 {
2704 if (brw->gen != 4 || brw->is_g4x)
2705 return;
2706
2707 /* Note that we're done with register allocation, so GRF fs_regs always
2708 * have a .reg_offset of 0.
2709 */
2710
2711 foreach_list_safe(node, &this->instructions) {
2712 fs_inst *inst = (fs_inst *)node;
2713
2714 if (inst->mlen != 0 && inst->dst.file == GRF) {
2715 insert_gen4_pre_send_dependency_workarounds(inst);
2716 insert_gen4_post_send_dependency_workarounds(inst);
2717 }
2718 }
2719 }
2720
2721 /**
2722 * Turns the generic expression-style uniform pull constant load instruction
2723 * into a hardware-specific series of instructions for loading a pull
2724 * constant.
2725 *
2726 * The expression style allows the CSE pass before this to optimize out
2727 * repeated loads from the same offset, and gives the pre-register-allocation
2728 * scheduling full flexibility, while the conversion to native instructions
2729 * allows the post-register-allocation scheduler the best information
2730 * possible.
2731 *
2732 * Note that execution masking for setting up pull constant loads is special:
2733 * the channels that need to be written are unrelated to the current execution
2734 * mask, since a later instruction will use one of the result channels as a
2735 * source operand for all 8 or 16 of its channels.
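 *
 * On gen7, for example, a load at a (vec4-aligned) byte offset of 32 becomes
 * roughly:
 *
 *    FS_OPCODE_SET_SIMD4X2_OFFSET               payload, 8u
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7  dst, surf_index, payload
 *
 * (illustrative only; the exact operand setup is in the code below).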
2736 */
2737 void
2738 fs_visitor::lower_uniform_pull_constant_loads()
2739 {
2740 foreach_list(node, &this->instructions) {
2741 fs_inst *inst = (fs_inst *)node;
2742
2743 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2744 continue;
2745
2746 if (brw->gen >= 7) {
2747          /* Up to this point, the offset arg has been a vec4-aligned byte offset.
2748           * We need to turn it into a dword offset.
2749 */
2750 fs_reg const_offset_reg = inst->src[1];
2751 assert(const_offset_reg.file == IMM &&
2752 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2753 const_offset_reg.imm.u /= 4;
2754 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2755
2756 /* This is actually going to be a MOV, but since only the first dword
2757 * is accessed, we have a special opcode to do just that one. Note
2758 * that this needs to be an operation that will be considered a def
2759 * by live variable analysis, or register allocation will explode.
2760 */
2761 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2762 payload, const_offset_reg);
2763 setup->force_writemask_all = true;
2764
2765 setup->ir = inst->ir;
2766 setup->annotation = inst->annotation;
2767 inst->insert_before(setup);
2768
2769 /* Similarly, this will only populate the first 4 channels of the
2770 * result register (since we only use smear values from 0-3), but we
2771 * don't tell the optimizer.
2772 */
2773 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2774 inst->src[1] = payload;
2775
2776 invalidate_live_intervals();
2777 } else {
2778 /* Before register allocation, we didn't tell the scheduler about the
2779 * MRF we use. We know it's safe to use this MRF because nothing
2780 * else does except for register spill/unspill, which generates and
2781 * uses its MRF within a single IR instruction.
2782 */
2783 inst->base_mrf = 14;
2784 inst->mlen = 1;
2785 }
2786 }
2787 }
2788
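/**
 * Prints a single IR instruction in a compact form, e.g. (hypothetically)
 * "(+f0.0) add.sat vgrf7+1, vgrf2, u3.1, (null)".
 */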
2789 void
2790 fs_visitor::dump_instruction(backend_instruction *be_inst)
2791 {
2792 fs_inst *inst = (fs_inst *)be_inst;
2793
2794 if (inst->predicate) {
2795 printf("(%cf0.%d) ",
2796 inst->predicate_inverse ? '-' : '+',
2797 inst->flag_subreg);
2798 }
2799
2800 printf("%s", brw_instruction_name(inst->opcode));
2801 if (inst->saturate)
2802 printf(".sat");
2803 if (inst->conditional_mod) {
2804 printf(".cmod");
2805 if (!inst->predicate &&
2806 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2807 inst->opcode != BRW_OPCODE_IF &&
2808 inst->opcode != BRW_OPCODE_WHILE))) {
2809 printf(".f0.%d", inst->flag_subreg);
2810 }
2811 }
2812 printf(" ");
2813
2814
2815 switch (inst->dst.file) {
2816 case GRF:
2817 printf("vgrf%d", inst->dst.reg);
2818 if (inst->dst.reg_offset)
2819 printf("+%d", inst->dst.reg_offset);
2820 break;
2821 case MRF:
2822 printf("m%d", inst->dst.reg);
2823 break;
2824 case BAD_FILE:
2825 printf("(null)");
2826 break;
2827 case UNIFORM:
2828 printf("***u%d***", inst->dst.reg);
2829 break;
2830 default:
2831 printf("???");
2832 break;
2833 }
2834 printf(", ");
2835
2836 for (int i = 0; i < 3; i++) {
2837 if (inst->src[i].negate)
2838 printf("-");
2839 if (inst->src[i].abs)
2840 printf("|");
2841 switch (inst->src[i].file) {
2842 case GRF:
2843 printf("vgrf%d", inst->src[i].reg);
2844 if (inst->src[i].reg_offset)
2845 printf("+%d", inst->src[i].reg_offset);
2846 break;
2847 case MRF:
2848 printf("***m%d***", inst->src[i].reg);
2849 break;
2850 case UNIFORM:
2851 printf("u%d", inst->src[i].reg);
2852 if (inst->src[i].reg_offset)
2853 printf(".%d", inst->src[i].reg_offset);
2854 break;
2855 case BAD_FILE:
2856 printf("(null)");
2857 break;
2858 case IMM:
2859 switch (inst->src[i].type) {
2860 case BRW_REGISTER_TYPE_F:
2861 printf("%ff", inst->src[i].imm.f);
2862 break;
2863 case BRW_REGISTER_TYPE_D:
2864 printf("%dd", inst->src[i].imm.i);
2865 break;
2866 case BRW_REGISTER_TYPE_UD:
2867 printf("%uu", inst->src[i].imm.u);
2868 break;
2869 default:
2870 printf("???");
2871 break;
2872 }
2873 break;
2874 default:
2875 printf("???");
2876 break;
2877 }
2878 if (inst->src[i].abs)
2879 printf("|");
2880
2881 if (i < 3)
2882 printf(", ");
2883 }
2884
2885 printf(" ");
2886
2887 if (inst->force_uncompressed)
2888 printf("1sthalf ");
2889
2890 if (inst->force_sechalf)
2891 printf("2ndhalf ");
2892
2893 printf("\n");
2894 }
2895
2896 /**
2897 * Possibly returns an instruction that set up @param reg.
2898 *
2899 * Sometimes we want to take the result of some expression/variable
2900 * dereference tree and rewrite the instruction generating the result
2901 * of the tree. When processing the tree, we know that the
2902 * instructions generated are all writing temporaries that are dead
2903 * outside of this tree. So, if we have some instructions that write
2904 * a temporary, we're free to point that temp write somewhere else.
2905 *
2906  * Note that this doesn't guarantee that the returned instruction wrote
2907  * only reg -- it might be the size=4 destination of a texture instruction.
2908 */
2909 fs_inst *
2910 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2911 fs_inst *end,
2912 fs_reg reg)
2913 {
2914 if (end == start ||
2915 end->is_partial_write() ||
2916 reg.reladdr ||
2917 !reg.equals(end->dst)) {
2918 return NULL;
2919 } else {
2920 return end;
2921 }
2922 }
2923
2924 void
2925 fs_visitor::setup_payload_gen6()
2926 {
2927 bool uses_depth =
2928 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2929 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2930
2931 assert(brw->gen >= 6);
2932
2933 /* R0-1: masks, pixel X/Y coordinates. */
2934 c->nr_payload_regs = 2;
2935    /* R2: only for 32-pixel dispatch. */
2936
2937 /* R3-26: barycentric interpolation coordinates. These appear in the
2938 * same order that they appear in the brw_wm_barycentric_interp_mode
2939 * enum. Each set of coordinates occupies 2 registers if dispatch width
2940 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2941 * appear if they were enabled using the "Barycentric Interpolation
2942 * Mode" bits in WM_STATE.
2943 */
2944 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2945 if (barycentric_interp_modes & (1 << i)) {
2946 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2947 c->nr_payload_regs += 2;
2948 if (dispatch_width == 16) {
2949 c->nr_payload_regs += 2;
2950 }
2951 }
2952 }
2953
2954    /* R27: interpolated depth, if the shader uses source depth */
2955 if (uses_depth) {
2956 c->source_depth_reg = c->nr_payload_regs;
2957 c->nr_payload_regs++;
2958 if (dispatch_width == 16) {
2959 /* R28: interpolated depth if not 8-wide. */
2960 c->nr_payload_regs++;
2961 }
2962 }
2963 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2964 if (uses_depth) {
2965 c->source_w_reg = c->nr_payload_regs;
2966 c->nr_payload_regs++;
2967 if (dispatch_width == 16) {
2968 /* R30: interpolated W if not 8-wide. */
2969 c->nr_payload_regs++;
2970 }
2971 }
2972 /* R31: MSAA position offsets. */
2973 /* R32-: bary for 32-pixel. */
2974 /* R58-59: interp W for 32-pixel. */
2975
2976 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2977 c->source_depth_to_render_target = true;
2978 }
2979 }
2980
2981 void
2982 fs_visitor::assign_binding_table_offsets()
2983 {
2984 c->prog_data.binding_table.render_target_start = SURF_INDEX_DRAW(0);
2985 c->prog_data.base.binding_table.texture_start = SURF_INDEX_TEXTURE(0);
2986 c->prog_data.base.binding_table.ubo_start = SURF_INDEX_WM_UBO(0);
2987 c->prog_data.base.binding_table.shader_time_start = SURF_INDEX_WM_SHADER_TIME;
2988 c->prog_data.base.binding_table.gather_texture_start = SURF_INDEX_GATHER_TEXTURE(0);
2989 c->prog_data.base.binding_table.pull_constants_start = SURF_INDEX_FRAG_CONST_BUFFER;
2990
2991 /* c->prog_data.base.binding_table.size will be set by mark_surface_used. */
2992 }
2993
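/**
 * Drives the compile of one fragment shader variant: payload setup, IR
 * generation, the optimization loop, scheduling, and register allocation
 * (an overview of the steps below).
 */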
2994 bool
2995 fs_visitor::run()
2996 {
2997 sanity_param_count = fp->Base.Parameters->NumParameters;
2998 uint32_t orig_nr_params = c->prog_data.nr_params;
2999
3000 assign_binding_table_offsets();
3001
3002 if (brw->gen >= 6)
3003 setup_payload_gen6();
3004 else
3005 setup_payload_gen4();
3006
3007 if (0) {
3008 emit_dummy_fs();
3009 } else {
3010 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3011 emit_shader_time_begin();
3012
3013 calculate_urb_setup();
3014 if (brw->gen < 6)
3015 emit_interpolation_setup_gen4();
3016 else
3017 emit_interpolation_setup_gen6();
3018
3019 /* We handle discards by keeping track of the still-live pixels in f0.1.
3020 * Initialize it with the dispatched pixels.
3021 */
3022 if (fp->UsesKill) {
3023 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3024 discard_init->flag_subreg = 1;
3025 }
3026
3027       /* Generate FS IR for main().  (The visitor only descends into
3028        * functions called "main".)
3029 */
3030 if (shader) {
3031 foreach_list(node, &*shader->ir) {
3032 ir_instruction *ir = (ir_instruction *)node;
3033 base_ir = ir;
3034 this->result = reg_undef;
3035 ir->accept(this);
3036 }
3037 } else {
3038 emit_fragment_program_code();
3039 }
3040 base_ir = NULL;
3041 if (failed)
3042 return false;
3043
3044 emit(FS_OPCODE_PLACEHOLDER_HALT);
3045
3046 emit_fb_writes();
3047
3048 split_virtual_grfs();
3049
3050 move_uniform_array_access_to_pull_constants();
3051 setup_pull_constants();
3052
3053 bool progress;
3054 do {
3055 progress = false;
3056
3057 compact_virtual_grfs();
3058
3059 progress = remove_duplicate_mrf_writes() || progress;
3060
3061 progress = opt_algebraic() || progress;
3062 progress = opt_cse() || progress;
3063 progress = opt_copy_propagate() || progress;
3064 progress = dead_code_eliminate() || progress;
3065 progress = dead_code_eliminate_local() || progress;
3066 progress = register_coalesce() || progress;
3067 progress = register_coalesce_2() || progress;
3068 progress = compute_to_mrf() || progress;
3069 } while (progress);
3070
3071 remove_dead_constants();
3072
3073 schedule_instructions(false);
3074
3075 lower_uniform_pull_constant_loads();
3076
3077 assign_curb_setup();
3078 assign_urb_setup();
3079
3080 if (0) {
3081 /* Debug of register spilling: Go spill everything. */
3082 for (int i = 0; i < virtual_grf_count; i++) {
3083 spill_reg(i);
3084 }
3085 }
3086
3087 if (0)
3088 assign_regs_trivial();
3089 else {
3090 while (!assign_regs()) {
3091 if (failed)
3092 break;
3093 }
3094 }
3095 }
3096 assert(force_uncompressed_stack == 0);
3097 assert(force_sechalf_stack == 0);
3098
3099 /* This must come after all optimization and register allocation, since
3100 * it inserts dead code that happens to have side effects, and it does
3101 * so based on the actual physical registers in use.
3102 */
3103 insert_gen4_send_dependency_workarounds();
3104
3105 if (failed)
3106 return false;
3107
3108 schedule_instructions(true);
3109
3110 if (dispatch_width == 8) {
3111 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3112 } else {
3113 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3114
3115 /* Make sure we didn't try to sneak in an extra uniform */
3116 assert(orig_nr_params == c->prog_data.nr_params);
3117 (void) orig_nr_params;
3118 }
3119
3120 /* If any state parameters were appended, then ParameterValues could have
3121 * been realloced, in which case the driver uniform storage set up by
3122 * _mesa_associate_uniform_storage() would point to freed memory. Make
3123 * sure that didn't happen.
3124 */
3125 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3126
3127 return !failed;
3128 }
3129
3130 const unsigned *
3131 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3132 struct gl_fragment_program *fp,
3133 struct gl_shader_program *prog,
3134 unsigned *final_assembly_size)
3135 {
3136 bool start_busy = false;
3137 float start_time = 0;
3138
3139 if (unlikely(brw->perf_debug)) {
3140 start_busy = (brw->batch.last_bo &&
3141 drm_intel_bo_busy(brw->batch.last_bo));
3142 start_time = get_time();
3143 }
3144
3145 struct brw_shader *shader = NULL;
3146 if (prog)
3147 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3148
3149 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3150 if (prog) {
3151 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3152 _mesa_print_ir(shader->ir, NULL);
3153 printf("\n\n");
3154 } else {
3155 printf("ARB_fragment_program %d ir for native fragment shader\n",
3156 fp->Base.Id);
3157 _mesa_print_program(&fp->Base);
3158 }
3159 }
3160
3161 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3162 */
3163 fs_visitor v(brw, c, prog, fp, 8);
3164 if (!v.run()) {
3165 if (prog) {
3166 prog->LinkStatus = false;
3167 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3168 }
3169
3170 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3171 v.fail_msg);
3172
3173 return NULL;
3174 }
3175
3176 exec_list *simd16_instructions = NULL;
3177 fs_visitor v2(brw, c, prog, fp, 16);
3178 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3179 if (c->prog_data.nr_pull_params == 0) {
3180 /* Try a 16-wide compile */
3181 v2.import_uniforms(&v);
3182 if (!v2.run()) {
3183 perf_debug("16-wide shader failed to compile, falling back to "
3184 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3185 } else {
3186 simd16_instructions = &v2.instructions;
3187 }
3188 } else {
3189 perf_debug("Skipping 16-wide due to pull parameters.\n");
3190 }
3191 }
3192
3193 c->prog_data.dispatch_width = 8;
3194
3195 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3196 const unsigned *generated = g.generate_assembly(&v.instructions,
3197 simd16_instructions,
3198 final_assembly_size);
3199
3200 if (unlikely(brw->perf_debug) && shader) {
3201 if (shader->compiled_once)
3202 brw_wm_debug_recompile(brw, prog, &c->key);
3203 shader->compiled_once = true;
3204
3205 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3206 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3207 (get_time() - start_time) * 1000);
3208 }
3209 }
3210
3211 return generated;
3212 }
3213
3214 bool
3215 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3216 {
3217 struct brw_context *brw = brw_context(ctx);
3218 struct brw_wm_prog_key key;
3219
3220 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3221 return true;
3222
3223 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3224 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3225 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3226 bool program_uses_dfdy = fp->UsesDFdy;
3227
3228 memset(&key, 0, sizeof(key));
3229
3230 if (brw->gen < 6) {
3231 if (fp->UsesKill)
3232 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3233
3234 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3235 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3236
3237 /* Just assume depth testing. */
3238 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3239 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3240 }
3241
3242 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3243 BRW_FS_VARYING_INPUT_MASK) > 16)
3244 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3245
3246 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3247
3248 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3249 for (unsigned i = 0; i < sampler_count; i++) {
3250 if (fp->Base.ShadowSamplers & (1 << i)) {
3251 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3252 key.tex.swizzles[i] =
3253 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3254 } else {
3255 /* Color sampler: assume no swizzling. */
3256 key.tex.swizzles[i] = SWIZZLE_XYZW;
3257 }
3258 }
3259
3260 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3261 key.drawable_height = ctx->DrawBuffer->Height;
3262 }
3263
3264 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3265 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3266 }
3267
3268 key.nr_color_regions = 1;
3269
3270 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3271 * quality of the derivatives is likely to be determined by the driconf
3272 * option.
3273 */
3274 key.high_quality_derivatives = brw->disable_derivative_optimization;
3275
3276 key.program_string_id = bfp->id;
3277
3278 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3279 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3280
3281 bool success = do_wm_prog(brw, prog, bfp, &key);
3282
3283 brw->wm.base.prog_offset = old_prog_offset;
3284 brw->wm.prog_data = old_prog_data;
3285
3286 return success;
3287 }