i965: Merge together opcodes for SHADER_OPCODE_GEN4_SCRATCH_READ/WRITE
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "main/uniforms.h"
50 #include "brw_fs_live_variables.h"
51 #include "glsl/glsl_types.h"
52
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
176 ALU1(BFREV)
177 ALU3(BFE)
178 ALU2(BFI1)
179 ALU3(BFI2)
180 ALU1(FBH)
181 ALU1(FBL)
182 ALU1(CBIT)
183 ALU3(MAD)
184 ALU2(ADDC)
185 ALU2(SUBB)
186
187 /** Gen4 predicated IF. */
188 fs_inst *
189 fs_visitor::IF(uint32_t predicate)
190 {
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
192 inst->predicate = predicate;
193 return inst;
194 }
195
196 /** Gen6+ IF with embedded comparison. */
197 fs_inst *
198 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
199 {
200 assert(brw->gen >= 6);
201 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
202 reg_null_d, src0, src1);
203 inst->conditional_mod = condition;
204 return inst;
205 }
206
207 /**
208 * CMP: Sets the low bit of the destination channels with the result
209 * of the comparison, while the upper bits are undefined, and updates
210 * the flag register with the packed 16 bits of the result.
211 */
212 fs_inst *
213 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
214 {
215 fs_inst *inst;
216
217 /* Take the instruction:
218 *
219 * CMP null<d> src0<f> src1<f>
220 *
221 * Original gen4 does type conversion to the destination type before
222 * comparison, producing garbage results for floating point comparisons.
223 * gen5 does the comparison on the execution type (resolved source types),
224 * so dst type doesn't matter. gen6 does comparison and then uses the
225 * result as if it was the dst type with no conversion, which happens to
226 * mostly work out for float-interpreted-as-int since our comparisons are
227 * for >0, =0, <0.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
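/* Illustration (added comment, not from the original source): a caller might
 * use this helper as
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 *
 * where x is a hypothetical fs_reg operand and only the flag register result
 * is consumed.
 */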
243
244 exec_list
245 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
246 fs_reg varying_offset,
247 uint32_t const_offset)
248 {
249 exec_list instructions;
250 fs_inst *inst;
251
252 /* We have our constant surface use a pitch of 4 bytes, so our index can
253 * be any component of a vector, and then we load 4 contiguous
254 * components starting from that.
255 *
256 * We break down the const_offset to a portion added to the variable
257 * offset and a portion done using reg_offset, which means that if you
258 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
259 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
260 * CSE can later notice that those loads are all the same and eliminate
261 * the redundant ones.
262 */
263 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
264 instructions.push_tail(ADD(vec4_offset,
265 varying_offset, const_offset & ~3));
266
267 int scale = 1;
268 if (brw->gen == 4 && dispatch_width == 8) {
269 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
270 * u, v, r) as parameters, or we can just use the SIMD16 message
271 * consisting of (header, u). We choose the second, at the cost of a
272 * longer return length.
273 */
274 scale = 2;
275 }
276
277 enum opcode op;
278 if (brw->gen >= 7)
279 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
280 else
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
282 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
283 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
284 inst->regs_written = 4 * scale;
285 instructions.push_tail(inst);
286
287 if (brw->gen < 7) {
288 inst->base_mrf = 13;
289 inst->header_present = true;
290 if (brw->gen == 4)
291 inst->mlen = 3;
292 else
293 inst->mlen = 1 + dispatch_width / 8;
294 }
295
296 vec4_result.reg_offset += (const_offset & 3) * scale;
297 instructions.push_tail(MOV(dst, vec4_result));
298
299 return instructions;
300 }
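/* Worked example (added comment): with const_offset == 13 and scale == 1,
 * the code above emits vec4_offset = varying_offset + 12 (13 & ~3), loads 4
 * contiguous components into vec4_result, and the final MOV reads component
 * (13 & 3) == 1 of that vector via reg_offset.
 */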
301
302 /**
303 * A helper for MOV generation for fixing up broken hardware SEND dependency
304 * handling.
305 */
306 fs_inst *
307 fs_visitor::DEP_RESOLVE_MOV(int grf)
308 {
309 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
310
311 inst->ir = NULL;
312 inst->annotation = "send dependency resolve";
313
314 /* The caller always wants uncompressed to emit the minimal extra
315 * dependencies, and to avoid having to deal with aligning its regs to 2.
316 */
317 inst->force_uncompressed = true;
318
319 return inst;
320 }
321
322 bool
323 fs_inst::equals(fs_inst *inst)
324 {
325 return (opcode == inst->opcode &&
326 dst.equals(inst->dst) &&
327 src[0].equals(inst->src[0]) &&
328 src[1].equals(inst->src[1]) &&
329 src[2].equals(inst->src[2]) &&
330 saturate == inst->saturate &&
331 predicate == inst->predicate &&
332 conditional_mod == inst->conditional_mod &&
333 mlen == inst->mlen &&
334 base_mrf == inst->base_mrf &&
335 sampler == inst->sampler &&
336 target == inst->target &&
337 eot == inst->eot &&
338 header_present == inst->header_present &&
339 shadow_compare == inst->shadow_compare &&
340 offset == inst->offset);
341 }
342
343 bool
344 fs_inst::overwrites_reg(const fs_reg &reg)
345 {
346 return (reg.file == dst.file &&
347 reg.reg == dst.reg &&
348 reg.reg_offset >= dst.reg_offset &&
349 reg.reg_offset < dst.reg_offset + regs_written);
350 }
351
352 bool
353 fs_inst::is_send_from_grf()
354 {
355 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
356 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
357 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
358 src[1].file == GRF) ||
359 (is_tex() && src[0].file == GRF));
360 }
361
362 bool
363 fs_visitor::can_do_source_mods(fs_inst *inst)
364 {
365 if (brw->gen == 6 && inst->is_math())
366 return false;
367
368 if (inst->is_send_from_grf())
369 return false;
370
371 if (!inst->can_do_source_mods())
372 return false;
373
374 return true;
375 }
376
377 void
378 fs_reg::init()
379 {
380 memset(this, 0, sizeof(*this));
381 this->smear = -1;
382 }
383
384 /** Generic unset register constructor. */
385 fs_reg::fs_reg()
386 {
387 init();
388 this->file = BAD_FILE;
389 }
390
391 /** Immediate value constructor. */
392 fs_reg::fs_reg(float f)
393 {
394 init();
395 this->file = IMM;
396 this->type = BRW_REGISTER_TYPE_F;
397 this->imm.f = f;
398 }
399
400 /** Immediate value constructor. */
401 fs_reg::fs_reg(int32_t i)
402 {
403 init();
404 this->file = IMM;
405 this->type = BRW_REGISTER_TYPE_D;
406 this->imm.i = i;
407 }
408
409 /** Immediate value constructor. */
410 fs_reg::fs_reg(uint32_t u)
411 {
412 init();
413 this->file = IMM;
414 this->type = BRW_REGISTER_TYPE_UD;
415 this->imm.u = u;
416 }
417
418 /** Fixed brw_reg Immediate value constructor. */
419 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
420 {
421 init();
422 this->file = HW_REG;
423 this->fixed_hw_reg = fixed_hw_reg;
424 this->type = fixed_hw_reg.type;
425 }
426
427 bool
428 fs_reg::equals(const fs_reg &r) const
429 {
430 return (file == r.file &&
431 reg == r.reg &&
432 reg_offset == r.reg_offset &&
433 type == r.type &&
434 negate == r.negate &&
435 abs == r.abs &&
436 !reladdr && !r.reladdr &&
437 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
438 sizeof(fixed_hw_reg)) == 0 &&
439 smear == r.smear &&
440 imm.u == r.imm.u);
441 }
442
443 fs_reg
444 fs_reg::retype(uint32_t type)
445 {
446 fs_reg result = *this;
447 result.type = type;
448 return result;
449 }
450
451 bool
452 fs_reg::is_zero() const
453 {
454 if (file != IMM)
455 return false;
456
457 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
458 }
459
460 bool
461 fs_reg::is_one() const
462 {
463 if (file != IMM)
464 return false;
465
466 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
467 }
468
469 bool
470 fs_reg::is_valid_3src() const
471 {
472 return file == GRF || file == UNIFORM;
473 }
474
475 int
476 fs_visitor::type_size(const struct glsl_type *type)
477 {
478 unsigned int size, i;
479
480 switch (type->base_type) {
481 case GLSL_TYPE_UINT:
482 case GLSL_TYPE_INT:
483 case GLSL_TYPE_FLOAT:
484 case GLSL_TYPE_BOOL:
485 return type->components();
486 case GLSL_TYPE_ARRAY:
487 return type_size(type->fields.array) * type->length;
488 case GLSL_TYPE_STRUCT:
489 size = 0;
490 for (i = 0; i < type->length; i++) {
491 size += type_size(type->fields.structure[i].type);
492 }
493 return size;
494 case GLSL_TYPE_SAMPLER:
495 /* Samplers take up no register space, since they're baked in at
496 * link time.
497 */
498 return 0;
499 case GLSL_TYPE_ATOMIC_UINT:
500 return 0;
501 case GLSL_TYPE_VOID:
502 case GLSL_TYPE_ERROR:
503 case GLSL_TYPE_INTERFACE:
504 assert(!"not reached");
505 break;
506 }
507
508 return 0;
509 }
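/* Example (added comment): type_size() counts scalar components, so a vec4
 * is 4, a float[10] array is 10, a struct { vec3 a; float b; } is 4, and
 * samplers and atomic counters contribute 0.
 */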
510
511 fs_reg
512 fs_visitor::get_timestamp()
513 {
514 assert(brw->gen >= 7);
515
516 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
517 BRW_ARF_TIMESTAMP,
518 0),
519 BRW_REGISTER_TYPE_UD));
520
521 fs_reg dst = fs_reg(this, glsl_type::uint_type);
522
523 fs_inst *mov = emit(MOV(dst, ts));
524 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
525 * even if it's not enabled in the dispatch.
526 */
527 mov->force_writemask_all = true;
528 mov->force_uncompressed = true;
529
530 /* The caller wants the low 32 bits of the timestamp. Since it's running
531 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
532 * which is plenty of time for our purposes. It is identical across the
533 * EUs, but since it's tracking GPU core speed it will increment at a
534 * varying rate as render P-states change.
535 *
536 * The caller could also check if render P-states have changed (or anything
537 * else that might disrupt timing) by setting smear to 2 and checking if
538 * that field is != 0.
539 */
540 dst.smear = 0;
541
542 return dst;
543 }
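/* Back-of-the-envelope check (added comment): a 32-bit counter ticking at
 * roughly 1.2 GHz wraps after 2^32 / 1.2e9 ~= 3.6 seconds, which is where
 * the "~3 seconds" figure above comes from.
 */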
544
545 void
546 fs_visitor::emit_shader_time_begin()
547 {
548 current_annotation = "shader time start";
549 shader_start_time = get_timestamp();
550 }
551
552 void
553 fs_visitor::emit_shader_time_end()
554 {
555 current_annotation = "shader time end";
556
557 enum shader_time_shader_type type, written_type, reset_type;
558 if (dispatch_width == 8) {
559 type = ST_FS8;
560 written_type = ST_FS8_WRITTEN;
561 reset_type = ST_FS8_RESET;
562 } else {
563 assert(dispatch_width == 16);
564 type = ST_FS16;
565 written_type = ST_FS16_WRITTEN;
566 reset_type = ST_FS16_RESET;
567 }
568
569 fs_reg shader_end_time = get_timestamp();
570
571 /* Check that there weren't any timestamp reset events (assuming these
572 * were the only two timestamp reads that happened).
573 */
574 fs_reg reset = shader_end_time;
575 reset.smear = 2;
576 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
577 test->conditional_mod = BRW_CONDITIONAL_Z;
578 emit(IF(BRW_PREDICATE_NORMAL));
579
580 push_force_uncompressed();
581 fs_reg start = shader_start_time;
582 start.negate = true;
583 fs_reg diff = fs_reg(this, glsl_type::uint_type);
584 emit(ADD(diff, start, shader_end_time));
585
586 /* If there were no instructions between the two timestamp gets, the diff
587 * is 2 cycles. Remove that overhead, so I can forget about that when
588 * trying to determine the time taken for single instructions.
589 */
590 emit(ADD(diff, diff, fs_reg(-2u)));
591
592 emit_shader_time_write(type, diff);
593 emit_shader_time_write(written_type, fs_reg(1u));
594 emit(BRW_OPCODE_ELSE);
595 emit_shader_time_write(reset_type, fs_reg(1u));
596 emit(BRW_OPCODE_ENDIF);
597
598 pop_force_uncompressed();
599 }
600
601 void
602 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
603 fs_reg value)
604 {
605 int shader_time_index =
606 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
607 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
608
609 fs_reg payload;
610 if (dispatch_width == 8)
611 payload = fs_reg(this, glsl_type::uvec2_type);
612 else
613 payload = fs_reg(this, glsl_type::uint_type);
614
615 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
616 fs_reg(), payload, offset, value));
617 }
618
619 void
620 fs_visitor::fail(const char *format, ...)
621 {
622 va_list va;
623 char *msg;
624
625 if (failed)
626 return;
627
628 failed = true;
629
630 va_start(va, format);
631 msg = ralloc_vasprintf(mem_ctx, format, va);
632 va_end(va);
633 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
634
635 this->fail_msg = msg;
636
637 if (INTEL_DEBUG & DEBUG_WM) {
638 fprintf(stderr, "%s", msg);
639 }
640 }
641
642 fs_inst *
643 fs_visitor::emit(enum opcode opcode)
644 {
645 return emit(fs_inst(opcode));
646 }
647
648 fs_inst *
649 fs_visitor::emit(enum opcode opcode, fs_reg dst)
650 {
651 return emit(fs_inst(opcode, dst));
652 }
653
654 fs_inst *
655 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
656 {
657 return emit(fs_inst(opcode, dst, src0));
658 }
659
660 fs_inst *
661 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
662 {
663 return emit(fs_inst(opcode, dst, src0, src1));
664 }
665
666 fs_inst *
667 fs_visitor::emit(enum opcode opcode, fs_reg dst,
668 fs_reg src0, fs_reg src1, fs_reg src2)
669 {
670 return emit(fs_inst(opcode, dst, src0, src1, src2));
671 }
672
673 void
674 fs_visitor::push_force_uncompressed()
675 {
676 force_uncompressed_stack++;
677 }
678
679 void
680 fs_visitor::pop_force_uncompressed()
681 {
682 force_uncompressed_stack--;
683 assert(force_uncompressed_stack >= 0);
684 }
685
686 void
687 fs_visitor::push_force_sechalf()
688 {
689 force_sechalf_stack++;
690 }
691
692 void
693 fs_visitor::pop_force_sechalf()
694 {
695 force_sechalf_stack--;
696 assert(force_sechalf_stack >= 0);
697 }
698
699 /**
700 * Returns true if the instruction has a flag that means it won't
701 * update an entire destination register.
702 *
703 * For example, dead code elimination and live variable analysis want to know
704 * when a write to a variable screens off any preceding values that were in
705 * it.
706 */
707 bool
708 fs_inst::is_partial_write()
709 {
710 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
711 this->force_uncompressed ||
712 this->force_sechalf);
713 }
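/* Illustration (added comment, not from the original source): a predicated
 * MOV such as
 *
 *    (+f0) mov dst, src
 *
 * only writes the enabled channels of dst, so it must not be treated as
 * killing the previous value of dst; SEL is the exception because it writes
 * every channel regardless of the predicate.
 */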
714
715 int
716 fs_inst::regs_read(fs_visitor *v, int arg)
717 {
718 if (is_tex() && arg == 0 && src[0].file == GRF) {
719 if (v->dispatch_width == 16)
720 return (mlen + 1) / 2;
721 else
722 return mlen;
723 }
724 return 1;
725 }
726
727 /**
728 * Returns how many MRFs an FS opcode will write over.
729 *
730 * Note that this is not the 0 or 1 implied writes in an actual gen
731 * instruction -- the FS opcodes often generate MOVs in addition.
732 */
733 int
734 fs_visitor::implied_mrf_writes(fs_inst *inst)
735 {
736 if (inst->mlen == 0)
737 return 0;
738
739 if (inst->base_mrf == -1)
740 return 0;
741
742 switch (inst->opcode) {
743 case SHADER_OPCODE_RCP:
744 case SHADER_OPCODE_RSQ:
745 case SHADER_OPCODE_SQRT:
746 case SHADER_OPCODE_EXP2:
747 case SHADER_OPCODE_LOG2:
748 case SHADER_OPCODE_SIN:
749 case SHADER_OPCODE_COS:
750 return 1 * dispatch_width / 8;
751 case SHADER_OPCODE_POW:
752 case SHADER_OPCODE_INT_QUOTIENT:
753 case SHADER_OPCODE_INT_REMAINDER:
754 return 2 * dispatch_width / 8;
755 case SHADER_OPCODE_TEX:
756 case FS_OPCODE_TXB:
757 case SHADER_OPCODE_TXD:
758 case SHADER_OPCODE_TXF:
759 case SHADER_OPCODE_TXF_MS:
760 case SHADER_OPCODE_TG4:
761 case SHADER_OPCODE_TG4_OFFSET:
762 case SHADER_OPCODE_TXL:
763 case SHADER_OPCODE_TXS:
764 case SHADER_OPCODE_LOD:
765 return 1;
766 case FS_OPCODE_FB_WRITE:
767 return 2;
768 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
769 case SHADER_OPCODE_GEN4_SCRATCH_READ:
770 return 1;
771 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
772 return inst->mlen;
773 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
774 return 2;
775 case SHADER_OPCODE_UNTYPED_ATOMIC:
776 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
777 return 0;
778 default:
779 assert(!"not reached");
780 return inst->mlen;
781 }
782 }
783
784 int
785 fs_visitor::virtual_grf_alloc(int size)
786 {
787 if (virtual_grf_array_size <= virtual_grf_count) {
788 if (virtual_grf_array_size == 0)
789 virtual_grf_array_size = 16;
790 else
791 virtual_grf_array_size *= 2;
792 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
793 virtual_grf_array_size);
794 }
795 virtual_grf_sizes[virtual_grf_count] = size;
796 return virtual_grf_count++;
797 }
798
799 /** Fixed HW reg constructor. */
800 fs_reg::fs_reg(enum register_file file, int reg)
801 {
802 init();
803 this->file = file;
804 this->reg = reg;
805 this->type = BRW_REGISTER_TYPE_F;
806 }
807
808 /** Fixed HW reg constructor. */
809 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
810 {
811 init();
812 this->file = file;
813 this->reg = reg;
814 this->type = type;
815 }
816
817 /** Automatic reg constructor. */
818 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
819 {
820 init();
821
822 this->file = GRF;
823 this->reg = v->virtual_grf_alloc(v->type_size(type));
824 this->reg_offset = 0;
825 this->type = brw_type_for_base_type(type);
826 }
827
828 fs_reg *
829 fs_visitor::variable_storage(ir_variable *var)
830 {
831 return (fs_reg *)hash_table_find(this->variable_ht, var);
832 }
833
834 void
835 import_uniforms_callback(const void *key,
836 void *data,
837 void *closure)
838 {
839 struct hash_table *dst_ht = (struct hash_table *)closure;
840 const fs_reg *reg = (const fs_reg *)data;
841
842 if (reg->file != UNIFORM)
843 return;
844
845 hash_table_insert(dst_ht, data, key);
846 }
847
848 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
849 * This brings in those uniform definitions.
850 */
851 void
852 fs_visitor::import_uniforms(fs_visitor *v)
853 {
854 hash_table_call_foreach(v->variable_ht,
855 import_uniforms_callback,
856 variable_ht);
857 this->params_remap = v->params_remap;
858 this->nr_params_remap = v->nr_params_remap;
859 }
860
861 /* Our support for uniforms is piggy-backed on the struct
862 * gl_fragment_program, because that's where the values actually
863 * get stored, rather than in some global gl_shader_program uniform
864 * store.
865 */
866 void
867 fs_visitor::setup_uniform_values(ir_variable *ir)
868 {
869 int namelen = strlen(ir->name);
870
871 /* The data for our (non-builtin) uniforms is stored in a series of
872 * gl_uniform_driver_storage structs for each subcomponent that
873 * glGetUniformLocation() could name. We know it's been set up in the same
874 * order we'd walk the type, so walk the list of storage and find anything
875 * with our name, or the prefix of a component that starts with our name.
876 */
877 unsigned params_before = c->prog_data.nr_params;
878 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
879 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
880
881 if (strncmp(ir->name, storage->name, namelen) != 0 ||
882 (storage->name[namelen] != 0 &&
883 storage->name[namelen] != '.' &&
884 storage->name[namelen] != '[')) {
885 continue;
886 }
887
888 unsigned slots = storage->type->component_slots();
889 if (storage->array_elements)
890 slots *= storage->array_elements;
891
892 for (unsigned i = 0; i < slots; i++) {
893 c->prog_data.param[c->prog_data.nr_params++] =
894 &storage->storage[i].f;
895 }
896 }
897
898 /* Make sure we actually initialized the right amount of stuff here. */
899 assert(params_before + ir->type->component_slots() ==
900 c->prog_data.nr_params);
901 (void)params_before;
902 }
903
904
905 /* Our support for builtin uniforms is even scarier than non-builtin.
906 * It sits on top of the PROG_STATE_VAR parameters that are
907 * automatically updated from GL context state.
908 */
909 void
910 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
911 {
912 const ir_state_slot *const slots = ir->state_slots;
913 assert(ir->state_slots != NULL);
914
915 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
916 /* This state reference has already been set up by ir_to_mesa, but we'll
917 * get the same index back here.
918 */
919 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
920 (gl_state_index *)slots[i].tokens);
921
922 /* Add each of the unique swizzles of the element as a parameter.
923 * This'll end up matching the expected layout of the
924 * array/matrix/structure we're trying to fill in.
925 */
926 int last_swiz = -1;
927 for (unsigned int j = 0; j < 4; j++) {
928 int swiz = GET_SWZ(slots[i].swizzle, j);
929 if (swiz == last_swiz)
930 break;
931 last_swiz = swiz;
932
933 c->prog_data.param[c->prog_data.nr_params++] =
934 &fp->Base.Parameters->ParameterValues[index][swiz].f;
935 }
936 }
937 }
938
939 fs_reg *
940 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
941 {
942 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
943 fs_reg wpos = *reg;
944 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
945
946 /* gl_FragCoord.x */
947 if (ir->pixel_center_integer) {
948 emit(MOV(wpos, this->pixel_x));
949 } else {
950 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
951 }
952 wpos.reg_offset++;
953
954 /* gl_FragCoord.y */
955 if (!flip && ir->pixel_center_integer) {
956 emit(MOV(wpos, this->pixel_y));
957 } else {
958 fs_reg pixel_y = this->pixel_y;
959 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
960
961 if (flip) {
962 pixel_y.negate = true;
963 offset += c->key.drawable_height - 1.0;
964 }
965
966 emit(ADD(wpos, pixel_y, fs_reg(offset)));
967 }
968 wpos.reg_offset++;
969
970 /* gl_FragCoord.z */
971 if (brw->gen >= 6) {
972 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
973 } else {
974 emit(FS_OPCODE_LINTERP, wpos,
975 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
976 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
977 interp_reg(VARYING_SLOT_POS, 2));
978 }
979 wpos.reg_offset++;
980
981 /* gl_FragCoord.w: Already set up in emit_interpolation */
982 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
983
984 return reg;
985 }
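/* Worked example (added comment): for a window-system framebuffer with the
 * default lower-left gl_FragCoord origin, flip is true, so the .y component
 * above is computed as (drawable_height - 1 + 0.5) - pixel_y (the 0.5 is
 * dropped when pixel_center_integer is set), giving GL's bottom-up Y.
 */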
986
987 fs_inst *
988 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
989 glsl_interp_qualifier interpolation_mode,
990 bool is_centroid)
991 {
992 brw_wm_barycentric_interp_mode barycoord_mode;
993 if (brw->gen >= 6) {
994 if (is_centroid) {
995 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
996 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
997 else
998 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
999 } else {
1000 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1001 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1002 else
1003 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1004 }
1005 } else {
1006 /* On Ironlake and below, there is only one interpolation mode.
1007 * Centroid interpolation doesn't mean anything on this hardware --
1008 * there is no multisampling.
1009 */
1010 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1011 }
1012 return emit(FS_OPCODE_LINTERP, attr,
1013 this->delta_x[barycoord_mode],
1014 this->delta_y[barycoord_mode], interp);
1015 }
1016
1017 fs_reg *
1018 fs_visitor::emit_general_interpolation(ir_variable *ir)
1019 {
1020 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1021 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1022 fs_reg attr = *reg;
1023
1024 unsigned int array_elements;
1025 const glsl_type *type;
1026
1027 if (ir->type->is_array()) {
1028 array_elements = ir->type->length;
1029 if (array_elements == 0) {
1030 fail("dereferenced array '%s' has length 0\n", ir->name);
1031 }
1032 type = ir->type->fields.array;
1033 } else {
1034 array_elements = 1;
1035 type = ir->type;
1036 }
1037
1038 glsl_interp_qualifier interpolation_mode =
1039 ir->determine_interpolation_mode(c->key.flat_shade);
1040
1041 int location = ir->location;
1042 for (unsigned int i = 0; i < array_elements; i++) {
1043 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1044 if (c->prog_data.urb_setup[location] == -1) {
1045 /* If there's no incoming setup data for this slot, don't
1046 * emit interpolation for it.
1047 */
1048 attr.reg_offset += type->vector_elements;
1049 location++;
1050 continue;
1051 }
1052
1053 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1054 /* Constant interpolation (flat shading) case. The SF has
1055 * handed us defined values in only the constant offset
1056 * field of the setup reg.
1057 */
1058 for (unsigned int k = 0; k < type->vector_elements; k++) {
1059 struct brw_reg interp = interp_reg(location, k);
1060 interp = suboffset(interp, 3);
1061 interp.type = reg->type;
1062 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1063 attr.reg_offset++;
1064 }
1065 } else {
1066 /* Smooth/noperspective interpolation case. */
1067 for (unsigned int k = 0; k < type->vector_elements; k++) {
1068 /* FINISHME: At some point we probably want to push
1069 * this farther by giving similar treatment to the
1070 * other potentially constant components of the
1071 * attribute, as well as making brw_vs_constval.c
1072 * handle varyings other than gl_TexCoord.
1073 */
1074 struct brw_reg interp = interp_reg(location, k);
1075 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1076 ir->centroid);
1077 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1078 /* Get the pixel/sample mask into f0 so that we know
1079 * which pixels are lit. Then, for each channel that is
1080 * unlit, replace the centroid data with non-centroid
1081 * data.
1082 */
1083 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1084 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1085 interpolation_mode, false);
1086 inst->predicate = BRW_PREDICATE_NORMAL;
1087 inst->predicate_inverse = true;
1088 }
1089 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1090 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1091 }
1092 attr.reg_offset++;
1093 }
1094
1095 }
1096 location++;
1097 }
1098 }
1099
1100 return reg;
1101 }
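/* Example (added comment): a mat3 input has one array element and three
 * matrix columns of three components each, so the loops above emit
 * interpolation for three consecutive locations, advancing attr.reg_offset
 * by one register per component.
 */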
1102
1103 fs_reg *
1104 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1105 {
1106 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1107
1108 /* The frontfacing comes in as a bit in the thread payload. */
1109 if (brw->gen >= 6) {
1110 emit(BRW_OPCODE_ASR, *reg,
1111 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1112 fs_reg(15));
1113 emit(BRW_OPCODE_NOT, *reg, *reg);
1114 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1115 } else {
1116 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1117 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1118 * us front face
1119 */
1120 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1121 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1122 }
1123
1124 return reg;
1125 }
1126
1127 fs_reg
1128 fs_visitor::fix_math_operand(fs_reg src)
1129 {
1130 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1131 * might be able to do better by doing execsize = 1 math and then
1132 * expanding that result out, but we would need to be careful with
1133 * masking.
1134 *
1135 * The hardware ignores source modifiers (negate and abs) on math
1136 * instructions, so we also move to a temp to set those up.
1137 */
1138 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1139 !src.abs && !src.negate)
1140 return src;
1141
1142 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1143 * operands to math
1144 */
1145 if (brw->gen >= 7 && src.file != IMM)
1146 return src;
1147
1148 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1149 expanded.type = src.type;
1150 emit(BRW_OPCODE_MOV, expanded, src);
1151 return expanded;
1152 }
1153
1154 fs_inst *
1155 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1156 {
1157 switch (opcode) {
1158 case SHADER_OPCODE_RCP:
1159 case SHADER_OPCODE_RSQ:
1160 case SHADER_OPCODE_SQRT:
1161 case SHADER_OPCODE_EXP2:
1162 case SHADER_OPCODE_LOG2:
1163 case SHADER_OPCODE_SIN:
1164 case SHADER_OPCODE_COS:
1165 break;
1166 default:
1167 assert(!"not reached: bad math opcode");
1168 return NULL;
1169 }
1170
1171 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1172 * might be able to do better by doing execsize = 1 math and then
1173 * expanding that result out, but we would need to be careful with
1174 * masking.
1175 *
1176 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1177 * instructions, so we also move to a temp to set those up.
1178 */
1179 if (brw->gen >= 6)
1180 src = fix_math_operand(src);
1181
1182 fs_inst *inst = emit(opcode, dst, src);
1183
1184 if (brw->gen < 6) {
1185 inst->base_mrf = 2;
1186 inst->mlen = dispatch_width / 8;
1187 }
1188
1189 return inst;
1190 }
1191
1192 fs_inst *
1193 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1194 {
1195 int base_mrf = 2;
1196 fs_inst *inst;
1197
1198 switch (opcode) {
1199 case SHADER_OPCODE_INT_QUOTIENT:
1200 case SHADER_OPCODE_INT_REMAINDER:
1201 if (brw->gen >= 7 && dispatch_width == 16)
1202 fail("16-wide INTDIV unsupported\n");
1203 break;
1204 case SHADER_OPCODE_POW:
1205 break;
1206 default:
1207 assert(!"not reached: unsupported binary math opcode.");
1208 return NULL;
1209 }
1210
1211 if (brw->gen >= 6) {
1212 src0 = fix_math_operand(src0);
1213 src1 = fix_math_operand(src1);
1214
1215 inst = emit(opcode, dst, src0, src1);
1216 } else {
1217 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1218 * "Message Payload":
1219 *
1220 * "Operand0[7]. For the INT DIV functions, this operand is the
1221 * denominator."
1222 * ...
1223 * "Operand1[7]. For the INT DIV functions, this operand is the
1224 * numerator."
1225 */
1226 bool is_int_div = opcode != SHADER_OPCODE_POW;
1227 fs_reg &op0 = is_int_div ? src1 : src0;
1228 fs_reg &op1 = is_int_div ? src0 : src1;
1229
1230 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1231 inst = emit(opcode, dst, op0, reg_null_f);
1232
1233 inst->base_mrf = base_mrf;
1234 inst->mlen = 2 * dispatch_width / 8;
1235 }
1236 return inst;
1237 }
1238
1239 void
1240 fs_visitor::assign_curb_setup()
1241 {
1242 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1243 if (dispatch_width == 8) {
1244 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1245 } else {
1246 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1247 }
1248
1249 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1250 foreach_list(node, &this->instructions) {
1251 fs_inst *inst = (fs_inst *)node;
1252
1253 for (unsigned int i = 0; i < 3; i++) {
1254 if (inst->src[i].file == UNIFORM) {
1255 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1256 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1257 constant_nr / 8,
1258 constant_nr % 8);
1259
1260 inst->src[i].file = HW_REG;
1261 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1262 }
1263 }
1264 }
1265 }
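/* Worked example (added comment): with nr_payload_regs == 2, UNIFORM slot 11
 * maps to g3.3 above (11 / 8 == 1 selects the second push-constant register,
 * 11 % 8 == 3 the channel within it).
 */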
1266
1267 void
1268 fs_visitor::calculate_urb_setup()
1269 {
1270 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1271 c->prog_data.urb_setup[i] = -1;
1272 }
1273
1274 int urb_next = 0;
1275 /* Figure out where each of the incoming setup attributes lands. */
1276 if (brw->gen >= 6) {
1277 if (_mesa_bitcount_64(fp->Base.InputsRead &
1278 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1279 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1280 * first 16 varying inputs, so we can put them wherever we want.
1281 * Just put them in order.
1282 *
1283 * This is useful because it means that (a) inputs not used by the
1284 * fragment shader won't take up valuable register space, and (b) we
1285 * won't have to recompile the fragment shader if it gets paired with
1286 * a different vertex (or geometry) shader.
1287 */
1288 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1289 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1290 BITFIELD64_BIT(i)) {
1291 c->prog_data.urb_setup[i] = urb_next++;
1292 }
1293 }
1294 } else {
1295 /* We have enough input varyings that the SF/SBE pipeline stage can't
1296 * arbitrarily rearrange them to suit our whim; we have to put them
1297 * in an order that matches the output of the previous pipeline stage
1298 * (geometry or vertex shader).
1299 */
1300 struct brw_vue_map prev_stage_vue_map;
1301 brw_compute_vue_map(brw, &prev_stage_vue_map,
1302 c->key.input_slots_valid);
1303 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1304 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1305 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1306 slot++) {
1307 int varying = prev_stage_vue_map.slot_to_varying[slot];
1308 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1309 * unused.
1310 */
1311 if (varying != BRW_VARYING_SLOT_COUNT &&
1312 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1313 BITFIELD64_BIT(varying))) {
1314 c->prog_data.urb_setup[varying] = slot - first_slot;
1315 }
1316 }
1317 urb_next = prev_stage_vue_map.num_slots - first_slot;
1318 }
1319 } else {
1320 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1321 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1322 /* Point size is packed into the header, not as a general attribute */
1323 if (i == VARYING_SLOT_PSIZ)
1324 continue;
1325
1326 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1327 /* The back color slot is skipped when the front color is
1328 * also written to. In addition, some slots can be
1329 * written in the vertex shader and not read in the
1330 * fragment shader. So the register number must always be
1331 * incremented, mapped or not.
1332 */
1333 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1334 c->prog_data.urb_setup[i] = urb_next;
1335 urb_next++;
1336 }
1337 }
1338
1339 /*
1340 * It's an FS-only attribute, and we did interpolation for this attribute
1341 * in the SF thread. So count it here, too.
1342 *
1343 * See compile_sf_prog() for more info.
1344 */
1345 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1346 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1347 }
1348
1349 c->prog_data.num_varying_inputs = urb_next;
1350 }
1351
1352 void
1353 fs_visitor::assign_urb_setup()
1354 {
1355 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1356
1357 /* Offset all the urb_setup[] indices by the actual position of the
1358 * setup regs, now that the location of the constants has been chosen.
1359 */
1360 foreach_list(node, &this->instructions) {
1361 fs_inst *inst = (fs_inst *)node;
1362
1363 if (inst->opcode == FS_OPCODE_LINTERP) {
1364 assert(inst->src[2].file == HW_REG);
1365 inst->src[2].fixed_hw_reg.nr += urb_start;
1366 }
1367
1368 if (inst->opcode == FS_OPCODE_CINTERP) {
1369 assert(inst->src[0].file == HW_REG);
1370 inst->src[0].fixed_hw_reg.nr += urb_start;
1371 }
1372 }
1373
1374 /* Each attribute is 4 setup channels, each of which is half a reg. */
1375 this->first_non_payload_grf =
1376 urb_start + c->prog_data.num_varying_inputs * 2;
1377 }
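/* Example (added comment): with urb_start == 10 and num_varying_inputs == 3,
 * the setup data occupies 3 * 2 == 6 registers and first_non_payload_grf
 * becomes 16.
 */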
1378
1379 /**
1380 * Split large virtual GRFs into separate components if we can.
1381 *
1382 * This is mostly duplicated with what brw_fs_vector_splitting does,
1383 * but that's really conservative because it's afraid of doing
1384 * splitting that doesn't result in real progress after the rest of
1385 * the optimization phases, which would cause infinite looping in
1386 * optimization. We can do it once here, safely. This also has the
1387 * opportunity to split interpolated values, or maybe even uniforms,
1388 * which we don't have at the IR level.
1389 *
1390 * We want to split, because virtual GRFs are what we register
1391 * allocate and spill (due to contiguousness requirements for some
1392 * instructions), and they're what we naturally generate in the
1393 * codegen process, but most virtual GRFs don't actually need to be
1394 * contiguous sets of GRFs. If we split, we'll end up with reduced
1395 * live intervals and better dead code elimination and coalescing.
1396 */
1397 void
1398 fs_visitor::split_virtual_grfs()
1399 {
1400 int num_vars = this->virtual_grf_count;
1401 bool split_grf[num_vars];
1402 int new_virtual_grf[num_vars];
1403
1404 /* Try to split anything larger than one register. */
1405 for (int i = 0; i < num_vars; i++) {
1406 if (this->virtual_grf_sizes[i] != 1)
1407 split_grf[i] = true;
1408 else
1409 split_grf[i] = false;
1410 }
1411
1412 if (brw->has_pln &&
1413 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1414 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1415 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1416 * Gen6, that was the only supported interpolation mode, and since Gen6,
1417 * delta_x and delta_y are in fixed hardware registers.
1418 */
1419 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1420 false;
1421 }
1422
1423 foreach_list(node, &this->instructions) {
1424 fs_inst *inst = (fs_inst *)node;
1425
1426 /* If there's a SEND message that requires contiguous destination
1427 * registers, no splitting is allowed.
1428 */
1429 if (inst->regs_written > 1) {
1430 split_grf[inst->dst.reg] = false;
1431 }
1432
1433 /* If we're sending from a GRF, don't split it, on the assumption that
1434 * the send is reading the whole thing.
1435 */
1436 if (inst->is_send_from_grf()) {
1437 for (int i = 0; i < 3; i++) {
1438 if (inst->src[i].file == GRF) {
1439 split_grf[inst->src[i].reg] = false;
1440 }
1441 }
1442 }
1443 }
1444
1445 /* Allocate new space for split regs. Note that the virtual
1446 * numbers will be contiguous.
1447 */
1448 for (int i = 0; i < num_vars; i++) {
1449 if (split_grf[i]) {
1450 new_virtual_grf[i] = virtual_grf_alloc(1);
1451 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1452 int reg = virtual_grf_alloc(1);
1453 assert(reg == new_virtual_grf[i] + j - 1);
1454 (void) reg;
1455 }
1456 this->virtual_grf_sizes[i] = 1;
1457 }
1458 }
1459
1460 foreach_list(node, &this->instructions) {
1461 fs_inst *inst = (fs_inst *)node;
1462
1463 if (inst->dst.file == GRF &&
1464 split_grf[inst->dst.reg] &&
1465 inst->dst.reg_offset != 0) {
1466 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1467 inst->dst.reg_offset - 1);
1468 inst->dst.reg_offset = 0;
1469 }
1470 for (int i = 0; i < 3; i++) {
1471 if (inst->src[i].file == GRF &&
1472 split_grf[inst->src[i].reg] &&
1473 inst->src[i].reg_offset != 0) {
1474 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1475 inst->src[i].reg_offset - 1);
1476 inst->src[i].reg_offset = 0;
1477 }
1478 }
1479 }
1480 invalidate_live_intervals();
1481 }
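/* Worked example (added comment): a split virtual GRF of size 4 keeps its
 * original number for reg_offset 0 and gets three freshly allocated size-1
 * GRFs for offsets 1..3; a source that used (reg == r, reg_offset == 2) is
 * rewritten above to (new_virtual_grf[r] + 1, reg_offset == 0).
 */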
1482
1483 /**
1484 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1485 *
1486 * During code generation, we create tons of temporary variables, many of
1487 * which get immediately killed and are never used again. Yet, in later
1488 * optimization and analysis passes, such as compute_live_intervals, we need
1489 * to loop over all the virtual GRFs. Compacting them can save a lot of
1490 * overhead.
1491 */
1492 void
1493 fs_visitor::compact_virtual_grfs()
1494 {
1495 /* Mark which virtual GRFs are used, and count how many. */
1496 int remap_table[this->virtual_grf_count];
1497 memset(remap_table, -1, sizeof(remap_table));
1498
1499 foreach_list(node, &this->instructions) {
1500 const fs_inst *inst = (const fs_inst *) node;
1501
1502 if (inst->dst.file == GRF)
1503 remap_table[inst->dst.reg] = 0;
1504
1505 for (int i = 0; i < 3; i++) {
1506 if (inst->src[i].file == GRF)
1507 remap_table[inst->src[i].reg] = 0;
1508 }
1509 }
1510
1511 /* In addition to registers used in instructions, fs_visitor keeps
1512 * direct references to certain special values which must be patched:
1513 */
1514 fs_reg *special[] = {
1515 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1516 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1517 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1518 &delta_x[0], &delta_x[1], &delta_x[2],
1519 &delta_x[3], &delta_x[4], &delta_x[5],
1520 &delta_y[0], &delta_y[1], &delta_y[2],
1521 &delta_y[3], &delta_y[4], &delta_y[5],
1522 };
1523 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1524 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1525
1526 /* Treat all special values as used, to be conservative */
1527 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1528 if (special[i]->file == GRF)
1529 remap_table[special[i]->reg] = 0;
1530 }
1531
1532 /* Compact the GRF arrays. */
1533 int new_index = 0;
1534 for (int i = 0; i < this->virtual_grf_count; i++) {
1535 if (remap_table[i] != -1) {
1536 remap_table[i] = new_index;
1537 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1538 invalidate_live_intervals();
1539 ++new_index;
1540 }
1541 }
1542
1543 this->virtual_grf_count = new_index;
1544
1545 /* Patch all the instructions to use the newly renumbered registers */
1546 foreach_list(node, &this->instructions) {
1547 fs_inst *inst = (fs_inst *) node;
1548
1549 if (inst->dst.file == GRF)
1550 inst->dst.reg = remap_table[inst->dst.reg];
1551
1552 for (int i = 0; i < 3; i++) {
1553 if (inst->src[i].file == GRF)
1554 inst->src[i].reg = remap_table[inst->src[i].reg];
1555 }
1556 }
1557
1558 /* Patch all the references to special values */
1559 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1560 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1561 special[i]->reg = remap_table[special[i]->reg];
1562 }
1563 }
1564
1565 bool
1566 fs_visitor::remove_dead_constants()
1567 {
1568 if (dispatch_width == 8) {
1569 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1570 this->nr_params_remap = c->prog_data.nr_params;
1571
1572 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1573 this->params_remap[i] = -1;
1574
1575 /* Find which params are still in use. */
1576 foreach_list(node, &this->instructions) {
1577 fs_inst *inst = (fs_inst *)node;
1578
1579 for (int i = 0; i < 3; i++) {
1580 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1581
1582 if (inst->src[i].file != UNIFORM)
1583 continue;
1584
1585 /* Section 5.11 of the OpenGL 4.3 spec says:
1586 *
1587 * "Out-of-bounds reads return undefined values, which include
1588 * values from other variables of the active program or zero."
1589 */
1590 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1591 constant_nr = 0;
1592 }
1593
1594 /* For now, set this to non-negative. We'll give it the
1595 * actual new number in a moment, in order to keep the
1596 * register numbers nicely ordered.
1597 */
1598 this->params_remap[constant_nr] = 0;
1599 }
1600 }
1601
1602 /* Figure out what the new numbers for the params will be. At some
1603 * point when we're doing uniform array access, we're going to want
1604 * to keep the distinction between .reg and .reg_offset, but for
1605 * now we don't care.
1606 */
1607 unsigned int new_nr_params = 0;
1608 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1609 if (this->params_remap[i] != -1) {
1610 this->params_remap[i] = new_nr_params++;
1611 }
1612 }
1613
1614 /* Update the list of params to be uploaded to match our new numbering. */
1615 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1616 int remapped = this->params_remap[i];
1617
1618 if (remapped == -1)
1619 continue;
1620
1621 c->prog_data.param[remapped] = c->prog_data.param[i];
1622 }
1623
1624 c->prog_data.nr_params = new_nr_params;
1625 } else {
1626 /* This should have been generated in the 8-wide pass already. */
1627 assert(this->params_remap);
1628 }
1629
1630 /* Now do the renumbering of the shader to remove unused params. */
1631 foreach_list(node, &this->instructions) {
1632 fs_inst *inst = (fs_inst *)node;
1633
1634 for (int i = 0; i < 3; i++) {
1635 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1636
1637 if (inst->src[i].file != UNIFORM)
1638 continue;
1639
1640 /* as above alias to 0 */
1641 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1642 constant_nr = 0;
1643 }
1644 assert(this->params_remap[constant_nr] != -1);
1645 inst->src[i].reg = this->params_remap[constant_nr];
1646 inst->src[i].reg_offset = 0;
1647 }
1648 }
1649
1650 return true;
1651 }
1652
1653 /*
1654 * Implements array access of uniforms by inserting a
1655 * PULL_CONSTANT_LOAD instruction.
1656 *
1657 * Unlike temporary GRF array access (where we don't support it due to
1658 * the difficulty of doing relative addressing on instruction
1659 * destinations), we could potentially do array access of uniforms
1660 * that were loaded in GRF space as push constants. In real-world
1661 * usage we've seen, though, the arrays being used are always larger
1662 * than we could load as push constants, so just always move all
1663 * uniform array access out to a pull constant buffer.
1664 */
1665 void
1666 fs_visitor::move_uniform_array_access_to_pull_constants()
1667 {
1668 int pull_constant_loc[c->prog_data.nr_params];
1669
1670 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1671 pull_constant_loc[i] = -1;
1672 }
1673
1674 /* Walk through and find array access of uniforms. Put a copy of that
1675 * uniform in the pull constant buffer.
1676 *
1677 * Note that we don't move constant-indexed accesses to arrays. No
1678 * testing has been done of the performance impact of this choice.
1679 */
1680 foreach_list_safe(node, &this->instructions) {
1681 fs_inst *inst = (fs_inst *)node;
1682
1683 for (int i = 0 ; i < 3; i++) {
1684 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1685 continue;
1686
1687 int uniform = inst->src[i].reg;
1688
1689 /* If this array isn't already present in the pull constant buffer,
1690 * add it.
1691 */
1692 if (pull_constant_loc[uniform] == -1) {
1693 const float **values = &c->prog_data.param[uniform];
1694
1695 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1696
1697 assert(param_size[uniform]);
1698
1699 for (int j = 0; j < param_size[uniform]; j++) {
1700 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1701 values[j];
1702 }
1703 }
1704
1705 /* Set up the annotation tracking for new generated instructions. */
1706 base_ir = inst->ir;
1707 current_annotation = inst->annotation;
1708
1709 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1710 fs_reg temp = fs_reg(this, glsl_type::float_type);
1711 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1712 surf_index,
1713 *inst->src[i].reladdr,
1714 pull_constant_loc[uniform] +
1715 inst->src[i].reg_offset);
1716 inst->insert_before(&list);
1717
1718 inst->src[i].file = temp.file;
1719 inst->src[i].reg = temp.reg;
1720 inst->src[i].reg_offset = temp.reg_offset;
1721 inst->src[i].reladdr = NULL;
1722 }
1723 }
1724 }
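/* Example (added comment): for "uniform vec4 a[20];" accessed as a[i], the
 * first reladdr source referencing it appends the param pointers for the
 * whole array (param_size[uniform] of them) to pull_param, and each such
 * source is then replaced by a temp GRF filled by VARYING_PULL_CONSTANT_LOAD.
 */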
1725
1726 /**
1727 * Choose accesses from the UNIFORM file to demote to using the pull
1728 * constant buffer.
1729 *
1730 * We allow a fragment shader to have more than the specified minimum
1731 * maximum number of fragment shader uniform components (64). If
1732 * there are too many of these, they'd fill up all of register space.
1733 * So, this will push some of them out to the pull constant buffer and
1734 * update the program to load them.
1735 */
1736 void
1737 fs_visitor::setup_pull_constants()
1738 {
1739 /* Only allow 16 registers (128 uniform components) as push constants. */
1740 unsigned int max_uniform_components = 16 * 8;
1741 if (c->prog_data.nr_params <= max_uniform_components)
1742 return;
1743
1744 if (dispatch_width == 16) {
1745 fail("Pull constants not supported in 16-wide\n");
1746 return;
1747 }
1748
1749 /* Just demote the end of the list. We could probably do better
1750 * here, demoting things that are rarely used in the program first.
1751 */
1752 unsigned int pull_uniform_base = max_uniform_components;
1753
1754 int pull_constant_loc[c->prog_data.nr_params];
1755 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1756 if (i < pull_uniform_base) {
1757 pull_constant_loc[i] = -1;
1758 } else {
1759 pull_constant_loc[i] = -1;
1760 /* If our constant is already being uploaded for reladdr purposes,
1761 * reuse it.
1762 */
1763 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1764 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1765 pull_constant_loc[i] = j;
1766 break;
1767 }
1768 }
1769 if (pull_constant_loc[i] == -1) {
1770 int pull_index = c->prog_data.nr_pull_params++;
1771 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1772 pull_constant_loc[i] = pull_index;
1773 }
1774 }
1775 }
1776 c->prog_data.nr_params = pull_uniform_base;
1777
1778 foreach_list(node, &this->instructions) {
1779 fs_inst *inst = (fs_inst *)node;
1780
1781 for (int i = 0; i < 3; i++) {
1782 if (inst->src[i].file != UNIFORM)
1783 continue;
1784
1785 int pull_index = pull_constant_loc[inst->src[i].reg +
1786 inst->src[i].reg_offset];
1787 if (pull_index == -1)
1788 continue;
1789
1790 assert(!inst->src[i].reladdr);
1791
1792 fs_reg dst = fs_reg(this, glsl_type::float_type);
1793 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1794 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1795 fs_inst *pull =
1796 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1797 dst, index, offset);
1798 pull->ir = inst->ir;
1799 pull->annotation = inst->annotation;
1800
1801 inst->insert_before(pull);
1802
1803 inst->src[i].file = GRF;
1804 inst->src[i].reg = dst.reg;
1805 inst->src[i].reg_offset = 0;
1806 inst->src[i].smear = pull_index & 3;
1807 }
1808 }
1809 }
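/* Worked example (added comment): a uniform demoted to pull_index 13 is
 * fetched with offset (13 * 4) & ~15 == 48 bytes and then read with
 * smear == 13 & 3 == 1, i.e. component 1 of the 16-byte block loaded by
 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD.
 */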
1810
1811 bool
1812 fs_visitor::opt_algebraic()
1813 {
1814 bool progress = false;
1815
1816 foreach_list(node, &this->instructions) {
1817 fs_inst *inst = (fs_inst *)node;
1818
1819 switch (inst->opcode) {
1820 case BRW_OPCODE_MUL:
1821 if (inst->src[1].file != IMM)
1822 continue;
1823
1824 /* a * 1.0 = a */
1825 if (inst->src[1].is_one()) {
1826 inst->opcode = BRW_OPCODE_MOV;
1827 inst->src[1] = reg_undef;
1828 progress = true;
1829 break;
1830 }
1831
1832 /* a * 0.0 = 0.0 */
1833 if (inst->src[1].is_zero()) {
1834 inst->opcode = BRW_OPCODE_MOV;
1835 inst->src[0] = inst->src[1];
1836 inst->src[1] = reg_undef;
1837 progress = true;
1838 break;
1839 }
1840
1841 break;
1842 case BRW_OPCODE_ADD:
1843 if (inst->src[1].file != IMM)
1844 continue;
1845
1846 /* a + 0.0 = a */
1847 if (inst->src[1].is_zero()) {
1848 inst->opcode = BRW_OPCODE_MOV;
1849 inst->src[1] = reg_undef;
1850 progress = true;
1851 break;
1852 }
1853 break;
1854 default:
1855 break;
1856 }
1857 }
1858
1859 return progress;
1860 }
1861
1862 /**
1863 * Removes any instructions writing a VGRF where that VGRF is not used by any
1864 * later instruction.
1865 */
1866 bool
1867 fs_visitor::dead_code_eliminate()
1868 {
1869 bool progress = false;
1870 int pc = 0;
1871
1872 calculate_live_intervals();
1873
1874 foreach_list_safe(node, &this->instructions) {
1875 fs_inst *inst = (fs_inst *)node;
1876
1877 if (inst->dst.file == GRF) {
1878 bool dead = true;
1879
1880 for (int i = 0; i < inst->regs_written; i++) {
1881 int var = live_intervals->var_from_vgrf[inst->dst.reg];
1882 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
1883 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
1884 dead = false;
1885 break;
1886 }
1887 }
1888
1889 if (dead) {
1890 /* Don't dead code eliminate instructions that write to the
1891 * accumulator as a side-effect. Instead just set the destination
1892 * to the null register to free it.
1893 */
1894 switch (inst->opcode) {
1895 case BRW_OPCODE_ADDC:
1896 case BRW_OPCODE_SUBB:
1897 case BRW_OPCODE_MACH:
1898 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
1899 break;
1900 default:
1901 inst->remove();
1902 progress = true;
1903 break;
1904 }
1905 }
1906 }
1907
1908 pc++;
1909 }
1910
1911 if (progress)
1912 invalidate_live_intervals();
1913
1914 return progress;
1915 }
1916
1917 struct dead_code_hash_key
1918 {
1919 int vgrf;
1920 int reg_offset;
1921 };
1922
1923 static bool
1924 dead_code_hash_compare(const void *a, const void *b)
1925 {
1926 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1927 }
1928
1929 static void
1930 clear_dead_code_hash(struct hash_table *ht)
1931 {
1932 struct hash_entry *entry;
1933
1934 hash_table_foreach(ht, entry) {
1935 _mesa_hash_table_remove(ht, entry);
1936 }
1937 }
1938
1939 static void
1940 insert_dead_code_hash(struct hash_table *ht,
1941 int vgrf, int reg_offset, fs_inst *inst)
1942 {
1943 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1944 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1945
1946 key->vgrf = vgrf;
1947 key->reg_offset = reg_offset;
1948
1949 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1950 }
1951
1952 static struct hash_entry *
1953 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1954 {
1955 struct dead_code_hash_key key;
1956
1957 key.vgrf = vgrf;
1958 key.reg_offset = reg_offset;
1959
1960 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1961 }
1962
1963 static void
1964 remove_dead_code_hash(struct hash_table *ht,
1965 int vgrf, int reg_offset)
1966 {
1967 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1968 if (!entry)
1969 return;
1970
1971 _mesa_hash_table_remove(ht, entry);
1972 }
1973
1974 /**
1975 * Walks basic blocks, removing any regs that are written but not read before
1976 * being redefined.
1977 *
1978 * The dead_code_eliminate() function implements a global dead code
1979 * elimination, but it only handles removing the last write to a register
1980 * if it's never read. This one can handle intermediate writes, but only
1981 * within a basic block.
1982 */
1983 bool
1984 fs_visitor::dead_code_eliminate_local()
1985 {
1986 struct hash_table *ht;
1987 bool progress = false;
1988
1989 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1990
1991 foreach_list_safe(node, &this->instructions) {
1992 fs_inst *inst = (fs_inst *)node;
1993
1994 /* At a basic block boundary, empty the HT since we don't understand
1995 * dataflow across blocks.
1996 */
1997 if (inst->is_control_flow()) {
1998 clear_dead_code_hash(ht);
1999 continue;
2000 }
2001
2002 /* Remove from the HT any previously recorded writes whose registers got read. */
2003 for (int i = 0; i < 3; i++) {
2004 fs_reg src = inst->src[i];
2005 if (src.file != GRF)
2006 continue;
2007
2008 int read = 1;
2009 if (inst->is_send_from_grf())
2010 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2011
2012 for (int reg_offset = src.reg_offset;
2013 reg_offset < src.reg_offset + read;
2014 reg_offset++) {
2015 remove_dead_code_hash(ht, src.reg, reg_offset);
2016 }
2017 }
2018
2019 /* Add any update of a GRF to the HT, removing a previous write if it
2020 * wasn't read.
2021 */
2022 if (inst->dst.file == GRF) {
2023 if (inst->regs_written > 1) {
2024 /* We don't know how to trim channels from an instruction's
2025 * writes, so we can't incrementally remove unread channels from
2026 * it. Just remove whatever it overwrites from the table.
2027 */
2028 for (int i = 0; i < inst->regs_written; i++) {
2029 remove_dead_code_hash(ht,
2030 inst->dst.reg,
2031 inst->dst.reg_offset + i);
2032 }
2033 } else {
2034 struct hash_entry *entry =
2035 get_dead_code_hash_entry(ht, inst->dst.reg,
2036 inst->dst.reg_offset);
2037
2038 if (inst->is_partial_write()) {
2039 /* For a partial write, we can't remove any previous dead code
2040 * candidate, since we're just modifying its result, but this write can
2041 * itself be dead code eliminated.
2042 */
2043 if (entry) {
2044 entry->data = inst;
2045 } else {
2046 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2047 inst);
2048 }
2049 } else {
2050 if (entry) {
2051 /* We're completely updating a channel, and there was a
2052 * previous write to the channel that wasn't read. Kill it!
2053 */
2054 fs_inst *inst = (fs_inst *)entry->data;
2055 inst->remove();
2056 progress = true;
2057 _mesa_hash_table_remove(ht, entry);
2058 }
2059
2060 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2061 inst);
2062 }
2063 }
2064 }
2065 }
2066
2067 _mesa_hash_table_destroy(ht, NULL);
2068
2069 if (progress)
2070 invalidate_live_intervals();
2071
2072 return progress;
2073 }
2074
2075 /**
2076 * Implements a second type of register coalescing: this one checks whether
2077 * the two regs involved in a raw move interfere; if they don't, they can
2078 * both be stored in the same place and the MOV removed.
2079 */
2080 bool
2081 fs_visitor::register_coalesce_2()
2082 {
2083 bool progress = false;
2084
2085 calculate_live_intervals();
2086
2087 foreach_list_safe(node, &this->instructions) {
2088 fs_inst *inst = (fs_inst *)node;
2089
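/* Only consider raw, full-width MOVs with no saturate or source modifiers,
 * no smear, matching types, and a single-register source VGRF; anything
 * else can't be trivially coalesced by this pass.
 */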
2090 if (inst->opcode != BRW_OPCODE_MOV ||
2091 inst->is_partial_write() ||
2092 inst->saturate ||
2093 inst->src[0].file != GRF ||
2094 inst->src[0].negate ||
2095 inst->src[0].abs ||
2096 inst->src[0].smear != -1 ||
2097 inst->dst.file != GRF ||
2098 inst->dst.type != inst->src[0].type ||
2099 virtual_grf_sizes[inst->src[0].reg] != 1) {
2100 continue;
2101 }
2102
2103 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2104 int var_to = live_intervals->var_from_reg(&inst->dst);
2105
2106 if (live_intervals->vars_interfere(var_from, var_to))
2107 continue;
2108
2109 int reg_from = inst->src[0].reg;
2110 assert(inst->src[0].reg_offset == 0);
2111 int reg_to = inst->dst.reg;
2112 int reg_to_offset = inst->dst.reg_offset;
2113
2114 foreach_list(node, &this->instructions) {
2115 fs_inst *scan_inst = (fs_inst *)node;
2116
2117 if (scan_inst->dst.file == GRF &&
2118 scan_inst->dst.reg == reg_from) {
2119 scan_inst->dst.reg = reg_to;
2120 scan_inst->dst.reg_offset = reg_to_offset;
2121 }
2122 for (int i = 0; i < 3; i++) {
2123 if (scan_inst->src[i].file == GRF &&
2124 scan_inst->src[i].reg == reg_from) {
2125 scan_inst->src[i].reg = reg_to;
2126 scan_inst->src[i].reg_offset = reg_to_offset;
2127 }
2128 }
2129 }
2130
2131 inst->remove();
2132 progress = true;
2133 continue;
2134 }
2135
2136 if (progress)
2137 invalidate_live_intervals();
2138
2139 return progress;
2140 }
2141
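/**
 * Coalesces raw MOVs from a GRF or uniform into a GRF by rewriting later
 * reads of the MOV's destination to read its source instead, provided the
 * scan below finds nothing that would interfere (such as later writes to
 * either register).
 */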
2142 bool
2143 fs_visitor::register_coalesce()
2144 {
2145 bool progress = false;
2146 int if_depth = 0;
2147 int loop_depth = 0;
2148
2149 foreach_list_safe(node, &this->instructions) {
2150 fs_inst *inst = (fs_inst *)node;
2151
2152 /* Make sure that we dominate the instructions we're going to
2153 * scan for interference with our coalescing, or we won't have
2154 * scanned enough to know whether anything actually interferes
2155 * with it. We don't dominate the following instructions if
2156 * we're inside a loop or an if block.
2157 */
2158 switch (inst->opcode) {
2159 case BRW_OPCODE_DO:
2160 loop_depth++;
2161 break;
2162 case BRW_OPCODE_WHILE:
2163 loop_depth--;
2164 break;
2165 case BRW_OPCODE_IF:
2166 if_depth++;
2167 break;
2168 case BRW_OPCODE_ENDIF:
2169 if_depth--;
2170 break;
2171 default:
2172 break;
2173 }
2174 if (loop_depth || if_depth)
2175 continue;
2176
2177 if (inst->opcode != BRW_OPCODE_MOV ||
2178 inst->is_partial_write() ||
2179 inst->saturate ||
2180 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2181 inst->src[0].file != UNIFORM) ||
2182 inst->dst.type != inst->src[0].type)
2183 continue;
2184
2185 bool has_source_modifiers = (inst->src[0].abs ||
2186 inst->src[0].negate ||
2187 inst->src[0].smear != -1 ||
2188 inst->src[0].file == UNIFORM);
2189
2190 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2191 * them: check for no writes to either one until the exit of the
2192 * program.
2193 */
2194 bool interfered = false;
2195
2196 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2197 !scan_inst->is_tail_sentinel();
2198 scan_inst = (fs_inst *)scan_inst->next) {
2199 if (scan_inst->dst.file == GRF) {
2200 if (scan_inst->overwrites_reg(inst->dst) ||
2201 scan_inst->overwrites_reg(inst->src[0])) {
2202 interfered = true;
2203 break;
2204 }
2205 }
2206
2207 if (has_source_modifiers) {
2208 for (int i = 0; i < 3; i++) {
2209 if (scan_inst->src[i].file == GRF &&
2210 scan_inst->src[i].reg == inst->dst.reg &&
2211 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2212 inst->dst.type != scan_inst->src[i].type)
2213 {
2214 interfered = true;
2215 break;
2216 }
2217 }
2218 }
2219
2221 /* The gen6 MATH instruction can't handle source modifiers or
2222 * unusual register regions, so avoid coalescing those for
2223 * now. We should do something more specific.
2224 */
2225 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2226 interfered = true;
2227 break;
2228 }
2229
2230 if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2231 scan_inst->src[0].file == GRF &&
2232 scan_inst->src[0].reg == inst->dst.reg) {
2233 interfered = true;
2234 break;
2235 }
2236
2237 /* The accumulator result appears to get used for the
2238 * conditional modifier generation. When negating a UD
2239 * value, there is a 33rd bit generated for the sign in the
2240 * accumulator value, so now you can't check, for example,
2241 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2242 */
2243 if (scan_inst->conditional_mod &&
2244 inst->src[0].negate &&
2245 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2246 interfered = true;
2247 break;
2248 }
2249 }
2250 if (interfered) {
2251 continue;
2252 }
2253
2254 /* Rewrite the later usage to point at the source of the move to
2255 * be removed.
2256 */
2257 for (fs_inst *scan_inst = inst;
2258 !scan_inst->is_tail_sentinel();
2259 scan_inst = (fs_inst *)scan_inst->next) {
2260 for (int i = 0; i < 3; i++) {
2261 if (scan_inst->src[i].file == GRF &&
2262 scan_inst->src[i].reg == inst->dst.reg &&
2263 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2264 fs_reg new_src = inst->src[0];
2265 if (scan_inst->src[i].abs) {
2266 new_src.negate = 0;
2267 new_src.abs = 1;
2268 }
2269 new_src.negate ^= scan_inst->src[i].negate;
2270 new_src.sechalf = scan_inst->src[i].sechalf;
2271 scan_inst->src[i] = new_src;
2272 }
2273 }
2274 }
2275
2276 inst->remove();
2277 progress = true;
2278 }
2279
2280 if (progress)
2281 invalidate_live_intervals();
2282
2283 return progress;
2284 }
2285
2286
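/**
 * Looks for MOVs of a GRF into an MRF and tries to make the instruction that
 * computed the GRF value write directly into the MRF instead, removing the
 * MOV.
 */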
2287 bool
2288 fs_visitor::compute_to_mrf()
2289 {
2290 bool progress = false;
2291 int next_ip = 0;
2292
2293 calculate_live_intervals();
2294
2295 foreach_list_safe(node, &this->instructions) {
2296 fs_inst *inst = (fs_inst *)node;
2297
2298 int ip = next_ip;
2299 next_ip++;
2300
2301 if (inst->opcode != BRW_OPCODE_MOV ||
2302 inst->is_partial_write() ||
2303 inst->dst.file != MRF || inst->src[0].file != GRF ||
2304 inst->dst.type != inst->src[0].type ||
2305 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2306 continue;
2307
2308 /* Work out which hardware MRF registers are written by this
2309 * instruction.
2310 */
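/* A compressed (16-wide) instruction writes two adjacent MRFs, while a
 * COMPR4 write pairs the base MRF with the one four above it.
 */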
2311 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2312 int mrf_high;
2313 if (inst->dst.reg & BRW_MRF_COMPR4) {
2314 mrf_high = mrf_low + 4;
2315 } else if (dispatch_width == 16 &&
2316 (!inst->force_uncompressed && !inst->force_sechalf)) {
2317 mrf_high = mrf_low + 1;
2318 } else {
2319 mrf_high = mrf_low;
2320 }
2321
2322 /* Can't compute-to-MRF this GRF if someone else was going to
2323 * read it later.
2324 */
2325 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2326 continue;
2327
2328 /* Found a move of a GRF to an MRF. Let's see if we can rewrite
2329 * the instruction that generated this GRF value to write into the MRF.
2330 */
2331 fs_inst *scan_inst;
2332 for (scan_inst = (fs_inst *)inst->prev;
2333 scan_inst->prev != NULL;
2334 scan_inst = (fs_inst *)scan_inst->prev) {
2335 if (scan_inst->dst.file == GRF &&
2336 scan_inst->dst.reg == inst->src[0].reg) {
2337 /* Found the last instruction to write the reg we want to turn
2338 * into a compute-to-MRF.
2339 */
2340
2341 /* If this one instruction didn't populate all the
2342 * channels, bail. We might be able to rewrite everything
2343 * that writes that reg, but it would require smarter
2344 * tracking to delay the rewriting until complete success.
2345 */
2346 if (scan_inst->is_partial_write())
2347 break;
2348
2349 /* Things returning more than one register would need us to
2350 * understand coalescing out more than one MOV at a time.
2351 */
2352 if (scan_inst->regs_written > 1)
2353 break;
2354
2355 /* SEND instructions can't have MRF as a destination. */
2356 if (scan_inst->mlen)
2357 break;
2358
2359 if (brw->gen == 6) {
2360 /* gen6 math instructions must have the destination be
2361 * GRF, so no compute-to-MRF for them.
2362 */
2363 if (scan_inst->is_math()) {
2364 break;
2365 }
2366 }
2367
2368 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2369 /* Found the creator of our MRF's source value. */
2370 scan_inst->dst.file = MRF;
2371 scan_inst->dst.reg = inst->dst.reg;
2372 scan_inst->saturate |= inst->saturate;
2373 inst->remove();
2374 progress = true;
2375 }
2376 break;
2377 }
2378
2379 /* We don't handle control flow here. Most computations of
2380 * values that end up in MRFs happen shortly before the MRF
2381 * write anyway.
2382 */
2383 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2384 break;
2385
2386 /* You can't read from an MRF, so if someone else reads our
2387 * MRF's source GRF that we wanted to rewrite, that stops us.
2388 */
2389 bool interfered = false;
2390 for (int i = 0; i < 3; i++) {
2391 if (scan_inst->src[i].file == GRF &&
2392 scan_inst->src[i].reg == inst->src[0].reg &&
2393 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2394 interfered = true;
2395 }
2396 }
2397 if (interfered)
2398 break;
2399
2400 if (scan_inst->dst.file == MRF) {
2401 /* If somebody else writes our MRF here, we can't
2402 * compute-to-MRF before that.
2403 */
2404 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2405 int scan_mrf_high;
2406
2407 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2408 scan_mrf_high = scan_mrf_low + 4;
2409 } else if (dispatch_width == 16 &&
2410 (!scan_inst->force_uncompressed &&
2411 !scan_inst->force_sechalf)) {
2412 scan_mrf_high = scan_mrf_low + 1;
2413 } else {
2414 scan_mrf_high = scan_mrf_low;
2415 }
2416
2417 if (mrf_low == scan_mrf_low ||
2418 mrf_low == scan_mrf_high ||
2419 mrf_high == scan_mrf_low ||
2420 mrf_high == scan_mrf_high) {
2421 break;
2422 }
2423 }
2424
2425 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2426 /* Found a SEND instruction, which means that there are
2427 * live values in MRFs from base_mrf to base_mrf +
2428 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2429 * above it.
2430 */
2431 if (mrf_low >= scan_inst->base_mrf &&
2432 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2433 break;
2434 }
2435 if (mrf_high >= scan_inst->base_mrf &&
2436 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2437 break;
2438 }
2439 }
2440 }
2441 }
2442
2443 if (progress)
2444 invalidate_live_intervals();
2445
2446 return progress;
2447 }
2448
2449 /**
2450 * Walks through basic blocks, looking for repeated MRF writes and
2451 * removing the later ones.
2452 */
2453 bool
2454 fs_visitor::remove_duplicate_mrf_writes()
2455 {
2456 fs_inst *last_mrf_move[16];
2457 bool progress = false;
2458
2459 /* The MRF tracking below would need updating to handle compressed instructions, so skip this pass for 16-wide. */
2460 if (dispatch_width == 16)
2461 return false;
2462
2463 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2464
2465 foreach_list_safe(node, &this->instructions) {
2466 fs_inst *inst = (fs_inst *)node;
2467
2468 if (inst->is_control_flow()) {
2469 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2470 }
2471
2472 if (inst->opcode == BRW_OPCODE_MOV &&
2473 inst->dst.file == MRF) {
2474 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2475 if (prev_inst && inst->equals(prev_inst)) {
2476 inst->remove();
2477 progress = true;
2478 continue;
2479 }
2480 }
2481
2482 /* Clear out the last-write records for MRFs that were overwritten. */
2483 if (inst->dst.file == MRF) {
2484 last_mrf_move[inst->dst.reg] = NULL;
2485 }
2486
2487 if (inst->mlen > 0 && inst->base_mrf != -1) {
2488 /* Found a SEND instruction, which will include two or fewer
2489 * implied MRF writes. We could do better here.
2490 */
2491 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2492 last_mrf_move[inst->base_mrf + i] = NULL;
2493 }
2494 }
2495
2496 /* Clear out any MRF move records whose sources got overwritten. */
2497 if (inst->dst.file == GRF) {
2498 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2499 if (last_mrf_move[i] &&
2500 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2501 last_mrf_move[i] = NULL;
2502 }
2503 }
2504 }
2505
2506 if (inst->opcode == BRW_OPCODE_MOV &&
2507 inst->dst.file == MRF &&
2508 inst->src[0].file == GRF &&
2509 !inst->is_partial_write()) {
2510 last_mrf_move[inst->dst.reg] = inst;
2511 }
2512 }
2513
2514 if (progress)
2515 invalidate_live_intervals();
2516
2517 return progress;
2518 }
2519
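/**
 * Helper for the gen4 SEND dependency workarounds: clears the dependency
 * flags for any GRFs in [first_grf, first_grf + grf_len) that this
 * instruction reads, since a read resolves the outstanding dependency.
 */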
2520 static void
2521 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2522 int first_grf, int grf_len)
2523 {
2524 bool inst_16wide = (dispatch_width > 8 &&
2525 !inst->force_uncompressed &&
2526 !inst->force_sechalf);
2527
2528 /* Clear the flag for registers that actually got read (as expected). */
2529 for (int i = 0; i < 3; i++) {
2530 int grf;
2531 if (inst->src[i].file == GRF) {
2532 grf = inst->src[i].reg;
2533 } else if (inst->src[i].file == HW_REG &&
2534 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2535 grf = inst->src[i].fixed_hw_reg.nr;
2536 } else {
2537 continue;
2538 }
2539
2540 if (grf >= first_grf &&
2541 grf < first_grf + grf_len) {
2542 deps[grf - first_grf] = false;
2543 if (inst_16wide)
2544 deps[grf - first_grf + 1] = false;
2545 }
2546 }
2547 }
2548
2549 /**
2550 * Implements this workaround for the original 965:
2551 *
2552 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2553 * check for post destination dependencies on this instruction, software
2554 * must ensure that there is no destination hazard for the case of ‘write
2555 * followed by a posted write’ shown in the following example.
2556 *
2557 * 1. mov r3 0
2558 * 2. send r3.xy <rest of send instruction>
2559 * 3. mov r2 r3
2560 *
2561 * Due to no post-destination dependency check on the ‘send’, the above
2562 * code sequence could have two instructions (1 and 2) in flight at the
2563 * same time that both consider ‘r3’ as the target of their final writes.
2564 */
2565 void
2566 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2567 {
2568 int reg_size = dispatch_width / 8;
2569 int write_len = inst->regs_written * reg_size;
2570 int first_write_grf = inst->dst.reg;
2571 bool needs_dep[BRW_MAX_MRF];
2572 assert(write_len < (int)sizeof(needs_dep) - 1);
2573
2574 memset(needs_dep, false, sizeof(needs_dep));
2575 memset(needs_dep, true, write_len);
2576
2577 clear_deps_for_inst_src(inst, dispatch_width,
2578 needs_dep, first_write_grf, write_len);
2579
2580 /* Walk backwards looking for writes to registers we're writing which
2581 * aren't read since being written. If we hit the start of the program,
2582 * we assume that there are no outstanding dependencies on entry to the
2583 * program.
2584 */
2585 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2586 scan_inst != NULL;
2587 scan_inst = (fs_inst *)scan_inst->prev) {
2588
2589 /* If we hit control flow, assume that there *are* outstanding
2590 * dependencies, and force their cleanup before our instruction.
2591 */
2592 if (scan_inst->is_control_flow()) {
2593 for (int i = 0; i < write_len; i++) {
2594 if (needs_dep[i]) {
2595 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2596 }
2597 }
2598 return;
2599 }
2600
2601 bool scan_inst_16wide = (dispatch_width > 8 &&
2602 !scan_inst->force_uncompressed &&
2603 !scan_inst->force_sechalf);
2604
2605 /* We insert our reads as late as possible, on the assumption that any
2606 * non-MOV instruction that might have left us an outstanding
2607 * dependency has more latency than a MOV.
2608 */
2609 if (scan_inst->dst.file == GRF) {
2610 for (int i = 0; i < scan_inst->regs_written; i++) {
2611 int reg = scan_inst->dst.reg + i * reg_size;
2612
2613 if (reg >= first_write_grf &&
2614 reg < first_write_grf + write_len &&
2615 needs_dep[reg - first_write_grf]) {
2616 inst->insert_before(DEP_RESOLVE_MOV(reg));
2617 needs_dep[reg - first_write_grf] = false;
2618 if (scan_inst_16wide)
2619 needs_dep[reg - first_write_grf + 1] = false;
2620 }
2621 }
2622 }
2623
2624 /* Clear the flag for registers that actually got read (as expected). */
2625 clear_deps_for_inst_src(scan_inst, dispatch_width,
2626 needs_dep, first_write_grf, write_len);
2627
2628 /* Continue the loop only if we haven't resolved all the dependencies */
2629 int i;
2630 for (i = 0; i < write_len; i++) {
2631 if (needs_dep[i])
2632 break;
2633 }
2634 if (i == write_len)
2635 return;
2636 }
2637 }
2638
2639 /**
2640 * Implements this workaround for the original 965:
2641 *
2642 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2643 * used as a destination register until after it has been sourced by an
2644 * instruction with a different destination register.
2645 */
2646 void
2647 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2648 {
2649 int write_len = inst->regs_written * dispatch_width / 8;
2650 int first_write_grf = inst->dst.reg;
2651 bool needs_dep[BRW_MAX_MRF];
2652 assert(write_len < (int)sizeof(needs_dep) - 1);
2653
2654 memset(needs_dep, false, sizeof(needs_dep));
2655 memset(needs_dep, true, write_len);
2656 /* Walk forwards looking for writes to registers we're writing which aren't
2657 * read before being written.
2658 */
2659 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2660 !scan_inst->is_tail_sentinel();
2661 scan_inst = (fs_inst *)scan_inst->next) {
2662 /* If we hit control flow, force resolve all remaining dependencies. */
2663 if (scan_inst->is_control_flow()) {
2664 for (int i = 0; i < write_len; i++) {
2665 if (needs_dep[i])
2666 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2667 }
2668 return;
2669 }
2670
2671 /* Clear the flag for registers that actually got read (as expected). */
2672 clear_deps_for_inst_src(scan_inst, dispatch_width,
2673 needs_dep, first_write_grf, write_len);
2674
2675 /* We insert our reads as late as possible since they're reading the
2676 * result of a SEND, which has massive latency.
2677 */
2678 if (scan_inst->dst.file == GRF &&
2679 scan_inst->dst.reg >= first_write_grf &&
2680 scan_inst->dst.reg < first_write_grf + write_len &&
2681 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2682 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2683 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2684 }
2685
2686 /* Continue the loop only if we haven't resolved all the dependencies */
2687 int i;
2688 for (i = 0; i < write_len; i++) {
2689 if (needs_dep[i])
2690 break;
2691 }
2692 if (i == write_len)
2693 return;
2694 }
2695
2696 /* If we hit the end of the program, resolve all remaining dependencies out
2697 * of paranoia.
2698 */
2699 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2700 assert(last_inst->eot);
2701 for (int i = 0; i < write_len; i++) {
2702 if (needs_dep[i])
2703 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2704 }
2705 }
2706
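/**
 * Applies both gen4 SEND dependency workarounds, before and after every
 * message-sending instruction that writes a GRF. Only needed on the
 * original gen4 (not G4X).
 */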
2707 void
2708 fs_visitor::insert_gen4_send_dependency_workarounds()
2709 {
2710 if (brw->gen != 4 || brw->is_g4x)
2711 return;
2712
2713 /* Note that we're done with register allocation, so GRF fs_regs always
2714 * have a .reg_offset of 0.
2715 */
2716
2717 foreach_list_safe(node, &this->instructions) {
2718 fs_inst *inst = (fs_inst *)node;
2719
2720 if (inst->mlen != 0 && inst->dst.file == GRF) {
2721 insert_gen4_pre_send_dependency_workarounds(inst);
2722 insert_gen4_post_send_dependency_workarounds(inst);
2723 }
2724 }
2725 }
2726
2727 /**
2728 * Turns the generic expression-style uniform pull constant load instruction
2729 * into a hardware-specific series of instructions for loading a pull
2730 * constant.
2731 *
2732 * The expression style allows the CSE pass before this to optimize out
2733 * repeated loads from the same offset, and gives the pre-register-allocation
2734 * scheduling full flexibility, while the conversion to native instructions
2735 * allows the post-register-allocation scheduler the best information
2736 * possible.
2737 *
2738 * Note that execution masking for setting up pull constant loads is special:
2739 * the channels that need to be written are unrelated to the current execution
2740 * mask, since a later instruction will use one of the result channels as a
2741 * source operand for all 8 or 16 of its channels.
2742 */
2743 void
2744 fs_visitor::lower_uniform_pull_constant_loads()
2745 {
2746 foreach_list(node, &this->instructions) {
2747 fs_inst *inst = (fs_inst *)node;
2748
2749 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2750 continue;
2751
2752 if (brw->gen >= 7) {
2753 /* The offset arg before was a vec4-aligned byte offset. We need to
2754 * turn it into a dword offset.
2755 */
2756 fs_reg const_offset_reg = inst->src[1];
2757 assert(const_offset_reg.file == IMM &&
2758 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2759 const_offset_reg.imm.u /= 4;
2760 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2761
2762 /* This is actually going to be a MOV, but since only the first dword
2763 * is accessed, we have a special opcode to do just that one. Note
2764 * that this needs to be an operation that will be considered a def
2765 * by live variable analysis, or register allocation will explode.
2766 */
2767 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2768 payload, const_offset_reg);
2769 setup->force_writemask_all = true;
2770
2771 setup->ir = inst->ir;
2772 setup->annotation = inst->annotation;
2773 inst->insert_before(setup);
2774
2775 /* Similarly, this will only populate the first 4 channels of the
2776 * result register (since we only use smear values from 0-3), but we
2777 * don't tell the optimizer.
2778 */
2779 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2780 inst->src[1] = payload;
2781
2782 invalidate_live_intervals();
2783 } else {
2784 /* Before register allocation, we didn't tell the scheduler about the
2785 * MRF we use. We know it's safe to use this MRF because nothing
2786 * else does except for register spill/unspill, which generates and
2787 * uses its MRF within a single IR instruction.
2788 */
2789 inst->base_mrf = 14;
2790 inst->mlen = 1;
2791 }
2792 }
2793 }
2794
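/**
 * Prints one FS IR instruction in a human-readable form for debugging.
 */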
2795 void
2796 fs_visitor::dump_instruction(backend_instruction *be_inst)
2797 {
2798 fs_inst *inst = (fs_inst *)be_inst;
2799
2800 if (inst->predicate) {
2801 printf("(%cf0.%d) ",
2802 inst->predicate_inverse ? '-' : '+',
2803 inst->flag_subreg);
2804 }
2805
2806 printf("%s", brw_instruction_name(inst->opcode));
2807 if (inst->saturate)
2808 printf(".sat");
2809 if (inst->conditional_mod) {
2810 printf(".cmod");
2811 if (!inst->predicate &&
2812 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2813 inst->opcode != BRW_OPCODE_IF &&
2814 inst->opcode != BRW_OPCODE_WHILE))) {
2815 printf(".f0.%d", inst->flag_subreg);
2816 }
2817 }
2818 printf(" ");
2819
2820
2821 switch (inst->dst.file) {
2822 case GRF:
2823 printf("vgrf%d", inst->dst.reg);
2824 if (inst->dst.reg_offset)
2825 printf("+%d", inst->dst.reg_offset);
2826 break;
2827 case MRF:
2828 printf("m%d", inst->dst.reg);
2829 break;
2830 case BAD_FILE:
2831 printf("(null)");
2832 break;
2833 case UNIFORM:
2834 printf("***u%d***", inst->dst.reg);
2835 break;
2836 case HW_REG:
2837 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2838 if (inst->dst.fixed_hw_reg.subnr)
2839 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2840 break;
2841 default:
2842 printf("???");
2843 break;
2844 }
2845 printf(", ");
2846
2847 for (int i = 0; i < 3; i++) {
2848 if (inst->src[i].negate)
2849 printf("-");
2850 if (inst->src[i].abs)
2851 printf("|");
2852 switch (inst->src[i].file) {
2853 case GRF:
2854 printf("vgrf%d", inst->src[i].reg);
2855 if (inst->src[i].reg_offset)
2856 printf("+%d", inst->src[i].reg_offset);
2857 break;
2858 case MRF:
2859 printf("***m%d***", inst->src[i].reg);
2860 break;
2861 case UNIFORM:
2862 printf("u%d", inst->src[i].reg);
2863 if (inst->src[i].reg_offset)
2864 printf(".%d", inst->src[i].reg_offset);
2865 break;
2866 case BAD_FILE:
2867 printf("(null)");
2868 break;
2869 case IMM:
2870 switch (inst->src[i].type) {
2871 case BRW_REGISTER_TYPE_F:
2872 printf("%ff", inst->src[i].imm.f);
2873 break;
2874 case BRW_REGISTER_TYPE_D:
2875 printf("%dd", inst->src[i].imm.i);
2876 break;
2877 case BRW_REGISTER_TYPE_UD:
2878 printf("%uu", inst->src[i].imm.u);
2879 break;
2880 default:
2881 printf("???");
2882 break;
2883 }
2884 break;
2885 case HW_REG:
2886 if (inst->src[i].fixed_hw_reg.negate)
2887 printf("-");
2888 if (inst->src[i].fixed_hw_reg.abs)
2889 printf("|");
2890 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2891 if (inst->src[i].fixed_hw_reg.subnr)
2892 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2893 if (inst->src[i].fixed_hw_reg.abs)
2894 printf("|");
2895 break;
2896 default:
2897 printf("???");
2898 break;
2899 }
2900 if (inst->src[i].abs)
2901 printf("|");
2902
2903 if (i < 2)
2904 printf(", ");
2905 }
2906
2907 printf(" ");
2908
2909 if (inst->force_uncompressed)
2910 printf("1sthalf ");
2911
2912 if (inst->force_sechalf)
2913 printf("2ndhalf ");
2914
2915 printf("\n");
2916 }
2917
2918 /**
2919 * Possibly returns an instruction that set up @param reg.
2920 *
2921 * Sometimes we want to take the result of some expression/variable
2922 * dereference tree and rewrite the instruction generating the result
2923 * of the tree. When processing the tree, we know that the
2924 * instructions generated are all writing temporaries that are dead
2925 * outside of this tree. So, if we have some instructions that write
2926 * a temporary, we're free to point that temp write somewhere else.
2927 *
2928 * Note that this doesn't guarantee that reg is the only thing the instruction
2929 * wrote -- it might be the size=4 destination of a texture instruction.
2930 */
2931 fs_inst *
2932 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2933 fs_inst *end,
2934 fs_reg reg)
2935 {
2936 if (end == start ||
2937 end->is_partial_write() ||
2938 reg.reladdr ||
2939 !reg.equals(end->dst)) {
2940 return NULL;
2941 } else {
2942 return end;
2943 }
2944 }
2945
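/**
 * Records how the gen6+ thread payload is laid out (dispatch masks and
 * pixel X/Y, barycentric coordinates, source depth and W), updating
 * c->nr_payload_regs as each section is accounted for.
 */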
2946 void
2947 fs_visitor::setup_payload_gen6()
2948 {
2949 bool uses_depth =
2950 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2951 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2952
2953 assert(brw->gen >= 6);
2954
2955 /* R0-1: masks, pixel X/Y coordinates. */
2956 c->nr_payload_regs = 2;
2957 /* R2: only for 32-pixel dispatch. */
2958
2959 /* R3-26: barycentric interpolation coordinates. These appear in the
2960 * same order that they appear in the brw_wm_barycentric_interp_mode
2961 * enum. Each set of coordinates occupies 2 registers if dispatch width
2962 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2963 * appear if they were enabled using the "Barycentric Interpolation
2964 * Mode" bits in WM_STATE.
2965 */
2966 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2967 if (barycentric_interp_modes & (1 << i)) {
2968 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2969 c->nr_payload_regs += 2;
2970 if (dispatch_width == 16) {
2971 c->nr_payload_regs += 2;
2972 }
2973 }
2974 }
2975
2976 /* R27: interpolated depth if uses source depth */
2977 if (uses_depth) {
2978 c->source_depth_reg = c->nr_payload_regs;
2979 c->nr_payload_regs++;
2980 if (dispatch_width == 16) {
2981 /* R28: interpolated depth if not 8-wide. */
2982 c->nr_payload_regs++;
2983 }
2984 }
2985 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2986 if (uses_depth) {
2987 c->source_w_reg = c->nr_payload_regs;
2988 c->nr_payload_regs++;
2989 if (dispatch_width == 16) {
2990 /* R30: interpolated W if not 8-wide. */
2991 c->nr_payload_regs++;
2992 }
2993 }
2994 /* R31: MSAA position offsets. */
2995 /* R32-: bary for 32-pixel. */
2996 /* R58-59: interp W for 32-pixel. */
2997
2998 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2999 c->source_depth_to_render_target = true;
3000 }
3001 }
3002
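/**
 * Lays out the binding table for the fragment shader, with the render target
 * surfaces first, followed by the common entries shared with other stages.
 */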
3003 void
3004 fs_visitor::assign_binding_table_offsets()
3005 {
3006 uint32_t next_binding_table_offset = 0;
3007
3008 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3009 next_binding_table_offset += c->key.nr_color_regions;
3010
3011 assign_common_binding_table_offsets(next_binding_table_offset);
3012 }
3013
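/**
 * Generates the FS IR for the program, runs the optimization passes, and
 * performs register allocation and scheduling; returns false on failure.
 */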
3014 bool
3015 fs_visitor::run()
3016 {
3017 sanity_param_count = fp->Base.Parameters->NumParameters;
3018 uint32_t orig_nr_params = c->prog_data.nr_params;
3019
3020 assign_binding_table_offsets();
3021
3022 if (brw->gen >= 6)
3023 setup_payload_gen6();
3024 else
3025 setup_payload_gen4();
3026
3027 if (0) {
3028 emit_dummy_fs();
3029 } else {
3030 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3031 emit_shader_time_begin();
3032
3033 calculate_urb_setup();
3034 if (fp->Base.InputsRead > 0) {
3035 if (brw->gen < 6)
3036 emit_interpolation_setup_gen4();
3037 else
3038 emit_interpolation_setup_gen6();
3039 }
3040
3041 /* We handle discards by keeping track of the still-live pixels in f0.1.
3042 * Initialize it with the dispatched pixels.
3043 */
3044 if (fp->UsesKill) {
3045 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3046 discard_init->flag_subreg = 1;
3047 }
3048
3049 /* Generate FS IR for main(). (The visitor only descends into
3050 * functions called "main".)
3051 */
3052 if (shader) {
3053 foreach_list(node, &*shader->ir) {
3054 ir_instruction *ir = (ir_instruction *)node;
3055 base_ir = ir;
3056 this->result = reg_undef;
3057 ir->accept(this);
3058 }
3059 } else {
3060 emit_fragment_program_code();
3061 }
3062 base_ir = NULL;
3063 if (failed)
3064 return false;
3065
3066 emit(FS_OPCODE_PLACEHOLDER_HALT);
3067
3068 emit_fb_writes();
3069
3070 split_virtual_grfs();
3071
3072 move_uniform_array_access_to_pull_constants();
3073 remove_dead_constants();
3074 setup_pull_constants();
3075
3076 bool progress;
3077 do {
3078 progress = false;
3079
3080 compact_virtual_grfs();
3081
3082 progress = remove_duplicate_mrf_writes() || progress;
3083
3084 progress = opt_algebraic() || progress;
3085 progress = opt_cse() || progress;
3086 progress = opt_copy_propagate() || progress;
3087 progress = dead_code_eliminate() || progress;
3088 progress = dead_code_eliminate_local() || progress;
3089 progress = register_coalesce() || progress;
3090 progress = register_coalesce_2() || progress;
3091 progress = compute_to_mrf() || progress;
3092 } while (progress);
3093
3094 schedule_instructions(false);
3095
3096 lower_uniform_pull_constant_loads();
3097
3098 assign_curb_setup();
3099 assign_urb_setup();
3100
3101 if (0)
3102 assign_regs_trivial();
3103 else {
3104 while (!assign_regs()) {
3105 if (failed)
3106 break;
3107 }
3108 }
3109 }
3110 assert(force_uncompressed_stack == 0);
3111 assert(force_sechalf_stack == 0);
3112
3113 /* This must come after all optimization and register allocation, since
3114 * it inserts dead code that happens to have side effects, and it does
3115 * so based on the actual physical registers in use.
3116 */
3117 insert_gen4_send_dependency_workarounds();
3118
3119 if (failed)
3120 return false;
3121
3122 schedule_instructions(true);
3123
3124 if (dispatch_width == 8) {
3125 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3126 } else {
3127 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3128
3129 /* Make sure we didn't try to sneak in an extra uniform */
3130 assert(orig_nr_params == c->prog_data.nr_params);
3131 (void) orig_nr_params;
3132 }
3133
3134 /* If any state parameters were appended, then ParameterValues could have
3135 * been realloced, in which case the driver uniform storage set up by
3136 * _mesa_associate_uniform_storage() would point to freed memory. Make
3137 * sure that didn't happen.
3138 */
3139 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3140
3141 return !failed;
3142 }
3143
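/**
 * Compiles a fragment program to native code, attempting an 8-wide compile
 * and, when possible, a 16-wide compile as well.
 */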
3144 const unsigned *
3145 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3146 struct gl_fragment_program *fp,
3147 struct gl_shader_program *prog,
3148 unsigned *final_assembly_size)
3149 {
3150 bool start_busy = false;
3151 float start_time = 0;
3152
3153 if (unlikely(brw->perf_debug)) {
3154 start_busy = (brw->batch.last_bo &&
3155 drm_intel_bo_busy(brw->batch.last_bo));
3156 start_time = get_time();
3157 }
3158
3159 struct brw_shader *shader = NULL;
3160 if (prog)
3161 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3162
3163 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3164 if (prog) {
3165 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3166 _mesa_print_ir(shader->ir, NULL);
3167 printf("\n\n");
3168 } else {
3169 printf("ARB_fragment_program %d ir for native fragment shader\n",
3170 fp->Base.Id);
3171 _mesa_print_program(&fp->Base);
3172 }
3173 }
3174
3175 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3176 */
3177 fs_visitor v(brw, c, prog, fp, 8);
3178 if (!v.run()) {
3179 if (prog) {
3180 prog->LinkStatus = false;
3181 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3182 }
3183
3184 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3185 v.fail_msg);
3186
3187 return NULL;
3188 }
3189
3190 exec_list *simd16_instructions = NULL;
3191 fs_visitor v2(brw, c, prog, fp, 16);
3192 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3193 if (c->prog_data.nr_pull_params == 0) {
3194 /* Try a 16-wide compile */
3195 v2.import_uniforms(&v);
3196 if (!v2.run()) {
3197 perf_debug("16-wide shader failed to compile, falling back to "
3198 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3199 } else {
3200 simd16_instructions = &v2.instructions;
3201 }
3202 } else {
3203 perf_debug("Skipping 16-wide due to pull parameters.\n");
3204 }
3205 }
3206
3207 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3208 const unsigned *generated = g.generate_assembly(&v.instructions,
3209 simd16_instructions,
3210 final_assembly_size);
3211
3212 if (unlikely(brw->perf_debug) && shader) {
3213 if (shader->compiled_once)
3214 brw_wm_debug_recompile(brw, prog, &c->key);
3215 shader->compiled_once = true;
3216
3217 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3218 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3219 (get_time() - start_time) * 1000);
3220 }
3221 }
3222
3223 return generated;
3224 }
3225
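/**
 * Precompiles the fragment shader at link time with a guessed default key,
 * so the compile that happens at draw time is more likely to be a cache hit.
 */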
3226 bool
3227 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3228 {
3229 struct brw_context *brw = brw_context(ctx);
3230 struct brw_wm_prog_key key;
3231
3232 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3233 return true;
3234
3235 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3236 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3237 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3238 bool program_uses_dfdy = fp->UsesDFdy;
3239
3240 memset(&key, 0, sizeof(key));
3241
3242 if (brw->gen < 6) {
3243 if (fp->UsesKill)
3244 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3245
3246 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3247 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3248
3249 /* Just assume depth testing. */
3250 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3251 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3252 }
3253
3254 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3255 BRW_FS_VARYING_INPUT_MASK) > 16)
3256 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3257
3258 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3259
3260 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3261 for (unsigned i = 0; i < sampler_count; i++) {
3262 if (fp->Base.ShadowSamplers & (1 << i)) {
3263 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3264 key.tex.swizzles[i] =
3265 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3266 } else {
3267 /* Color sampler: assume no swizzling. */
3268 key.tex.swizzles[i] = SWIZZLE_XYZW;
3269 }
3270 }
3271
3272 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3273 key.drawable_height = ctx->DrawBuffer->Height;
3274 }
3275
3276 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3277 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3278 }
3279
3280 key.nr_color_regions = 1;
3281
3282 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3283 * quality of the derivatives is likely to be determined by the driconf
3284 * option.
3285 */
3286 key.high_quality_derivatives = brw->disable_derivative_optimization;
3287
3288 key.program_string_id = bfp->id;
3289
3290 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3291 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3292
3293 bool success = do_wm_prog(brw, prog, bfp, &key);
3294
3295 brw->wm.base.prog_offset = old_prog_offset;
3296 brw->wm.prog_data = old_prog_data;
3297
3298 return success;
3299 }