src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/hash_table.h"
  36 #include "main/macros.h"
  37 #include "main/shaderobj.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "main/uniforms.h"
  50 #include "brw_fs_live_variables.h"
  51 #include "glsl/glsl_types.h"
  52
  53 void
  54 fs_inst::init()
  55 {
  56    memset(this, 0, sizeof(*this));
  57    this->opcode = BRW_OPCODE_NOP;
  58    this->conditional_mod = BRW_CONDITIONAL_NONE;
  59
  60    this->dst = reg_undef;
  61    this->src[0] = reg_undef;
  62    this->src[1] = reg_undef;
  63    this->src[2] = reg_undef;
  64
  65    /* This will be the case for almost all instructions. */
  66    this->regs_written = 1;
  67 }
  68
  69 fs_inst::fs_inst()
  70 {
  71    init();
  72 }
  73
  74 fs_inst::fs_inst(enum opcode opcode)
  75 {
  76    init();
  77    this->opcode = opcode;
  78 }
  79
  80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  81 {
  82    init();
  83    this->opcode = opcode;
  84    this->dst = dst;
  85
  86    if (dst.file == GRF)
  87       assert(dst.reg_offset >= 0);
  88 }
  89
  90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  91 {
  92    init();
  93    this->opcode = opcode;
  94    this->dst = dst;
  95    this->src[0] = src0;
  96
  97    if (dst.file == GRF)
  98       assert(dst.reg_offset >= 0);
  99    if (src[0].file == GRF)
 100       assert(src[0].reg_offset >= 0);
 101 }
 102
 103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 104 {
 105    init();
 106    this->opcode = opcode;
 107    this->dst = dst;
 108    this->src[0] = src0;
 109    this->src[1] = src1;
 110
 111    if (dst.file == GRF)
 112       assert(dst.reg_offset >= 0);
 113    if (src[0].file == GRF)
 114       assert(src[0].reg_offset >= 0);
 115    if (src[1].file == GRF)
 116       assert(src[1].reg_offset >= 0);
 117 }
 118
 119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 120                  fs_reg src0, fs_reg src1, fs_reg src2)
 121 {
 122    init();
 123    this->opcode = opcode;
 124    this->dst = dst;
 125    this->src[0] = src0;
 126    this->src[1] = src1;
 127    this->src[2] = src2;
 128
 129    if (dst.file == GRF)
 130       assert(dst.reg_offset >= 0);
 131    if (src[0].file == GRF)
 132       assert(src[0].reg_offset >= 0);
 133    if (src[1].file == GRF)
 134       assert(src[1].reg_offset >= 0);
 135    if (src[2].file == GRF)
 136       assert(src[2].reg_offset >= 0);
 137 }
 138
 139 #define ALU1(op)                                                        \
 140    fs_inst *                                                            \
 141    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 142    {                                                                    \
 143       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 144    }
 145
 146 #define ALU2(op)                                                        \
 147    fs_inst *                                                            \
 148    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 149    {                                                                    \
 150       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 151    }
 152
 153 #define ALU3(op)                                                        \
 154    fs_inst *                                                            \
 155    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
 156    {                                                                    \
 157       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 158    }
 159
 160 ALU1(NOT)
 161 ALU1(MOV)
 162 ALU1(FRC)
 163 ALU1(RNDD)
 164 ALU1(RNDE)
 165 ALU1(RNDZ)
 166 ALU2(ADD)
 167 ALU2(MUL)
 168 ALU2(MACH)
 169 ALU2(AND)
 170 ALU2(OR)
 171 ALU2(XOR)
 172 ALU2(SHL)
 173 ALU2(SHR)
 174 ALU2(ASR)
 175 ALU3(LRP)
 176 ALU1(BFREV)
 177 ALU3(BFE)
 178 ALU2(BFI1)
 179 ALU3(BFI2)
 180 ALU1(FBH)
 181 ALU1(FBL)
 182 ALU1(CBIT)
 183 ALU3(MAD)
 184 ALU2(ADDC)
 185 ALU2(SUBB)
 186
 187 /** Gen4 predicated IF. */
 188 fs_inst *
 189 fs_visitor::IF(uint32_t predicate)
 190 {
 191    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 192    inst->predicate = predicate;
 193    return inst;
 194 }
 195
 196 /** Gen6+ IF with embedded comparison. */
 197 fs_inst *
 198 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 199 {
 200    assert(brw->gen >= 6);
 201    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 202                                         reg_null_d, src0, src1);
 203    inst->conditional_mod = condition;
 204    return inst;
 205 }
 206
 207 /**
 208  * CMP: Sets the low bit of the destination channels with the result
 209  * of the comparison, while the upper bits are undefined, and updates
 210  * the flag register with the packed 16 bits of the result.
 211  */
 212 fs_inst *
 213 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 214 {
 215    fs_inst *inst;
 216
 217    /* Take the instruction:
 218     *
 219     * CMP null<d> src0<f> src1<f>
 220     *
 221     * Original gen4 does type conversion to the destination type before
 222     * comparison, producing garbage results for floating point comparisons.
 223     * gen5 does the comparison on the execution type (resolved source types),
 224     * so dst type doesn't matter.  gen6 does comparison and then uses the
 225     * result as if it was the dst type with no conversion, which happens to
 226     * mostly work out for float-interpreted-as-int since our comparisons are
 227     * for >0, =0, <0.
 228     */
 229    if (brw->gen == 4) {
 230       dst.type = src0.type;
 231       if (dst.file == HW_REG)
 232          dst.fixed_hw_reg.type = dst.type;
 233    }
 234
 235    resolve_ud_negate(&src0);
 236    resolve_ud_negate(&src1);
 237
 238    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 239    inst->conditional_mod = condition;
 240
 241    return inst;
 242 }
 243
 244 exec_list
 245 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
 246                                        fs_reg varying_offset,
 247                                        uint32_t const_offset)
 248 {
 249    exec_list instructions;
 250    fs_inst *inst;
 251
 252    /* We have our constant surface use a pitch of 4 bytes, so our index can
 253     * be any component of a vector, and then we load 4 contiguous
 254     * components starting from that.
 255     *
 256     * We break down the const_offset to a portion added to the variable
 257     * offset and a portion done using reg_offset, which means that if you
 258     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 259     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 260     * CSE can later notice that those loads are all the same and eliminate
 261     * the redundant ones.
 262     */
 263    fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
 264    instructions.push_tail(ADD(vec4_offset,
 265                               varying_offset, const_offset & ~3));
 266
 267    int scale = 1;
 268    if (brw->gen == 4 && dispatch_width == 8) {
 269       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 270        * u, v, r) as parameters, or we can just use the SIMD16 message
 271        * consisting of (header, u).  We choose the second, at the cost of a
 272        * longer return length.
 273        */
 274       scale = 2;
 275    }
 276
 277    enum opcode op;
 278    if (brw->gen >= 7)
 279       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 280    else
 281       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 282    fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
 283    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 284    inst->regs_written = 4 * scale;
 285    instructions.push_tail(inst);
 286
 287    if (brw->gen < 7) {
 288       inst->base_mrf = 13;
 289       inst->header_present = true;
 290       if (brw->gen == 4)
 291          inst->mlen = 3;
 292       else
 293          inst->mlen = 1 + dispatch_width / 8;
 294    }
 295
 296    vec4_result.reg_offset += (const_offset & 3) * scale;
 297    instructions.push_tail(MOV(dst, vec4_result));
 298
 299    return instructions;
 300 }
 301
 302 /**
 303  * A helper for MOV generation for fixing up broken hardware SEND dependency
 304  * handling.
 305  */
 306 fs_inst *
 307 fs_visitor::DEP_RESOLVE_MOV(int grf)
 308 {
 309    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 310
 311    inst->ir = NULL;
 312    inst->annotation = "send dependency resolve";
 313
 314    /* The caller always wants uncompressed to emit the minimal extra
 315     * dependencies, and to avoid having to deal with aligning its regs to 2.
 316     */
 317    inst->force_uncompressed = true;
 318
 319    return inst;
 320 }
 321
 322 bool
 323 fs_inst::equals(fs_inst *inst)
 324 {
 325    return (opcode == inst->opcode &&
 326            dst.equals(inst->dst) &&
 327            src[0].equals(inst->src[0]) &&
 328            src[1].equals(inst->src[1]) &&
 329            src[2].equals(inst->src[2]) &&
 330            saturate == inst->saturate &&
 331            predicate == inst->predicate &&
 332            conditional_mod == inst->conditional_mod &&
 333            mlen == inst->mlen &&
 334            base_mrf == inst->base_mrf &&
 335            sampler == inst->sampler &&
 336            target == inst->target &&
 337            eot == inst->eot &&
 338            header_present == inst->header_present &&
 339            shadow_compare == inst->shadow_compare &&
 340            offset == inst->offset);
 341 }
 342
 343 bool
 344 fs_inst::overwrites_reg(const fs_reg &reg)
 345 {
 346    return (reg.file == dst.file &&
 347            reg.reg == dst.reg &&
 348            reg.reg_offset >= dst.reg_offset  &&
 349            reg.reg_offset < dst.reg_offset + regs_written);
 350 }
 351
 352 bool
 353 fs_inst::is_send_from_grf()
 354 {
 355    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 356            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
 357            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 358             src[1].file == GRF) ||
 359            (is_tex() && src[0].file == GRF));
 360 }
 361
 362 bool
 363 fs_visitor::can_do_source_mods(fs_inst *inst)
 364 {
 365    if (brw->gen == 6 && inst->is_math())
 366       return false;
 367
 368    if (inst->is_send_from_grf())
 369       return false;
 370
 371    if (!inst->can_do_source_mods())
 372       return false;
 373
 374    return true;
 375 }
 376
 377 void
 378 fs_reg::init()
 379 {
 380    memset(this, 0, sizeof(*this));
 381    this->smear = -1;
 382 }
 383
 384 /** Generic unset register constructor. */
 385 fs_reg::fs_reg()
 386 {
 387    init();
 388    this->file = BAD_FILE;
 389 }
 390
 391 /** Immediate value constructor. */
 392 fs_reg::fs_reg(float f)
 393 {
 394    init();
 395    this->file = IMM;
 396    this->type = BRW_REGISTER_TYPE_F;
 397    this->imm.f = f;
 398 }
 399
 400 /** Immediate value constructor. */
 401 fs_reg::fs_reg(int32_t i)
 402 {
 403    init();
 404    this->file = IMM;
 405    this->type = BRW_REGISTER_TYPE_D;
 406    this->imm.i = i;
 407 }
 408
 409 /** Immediate value constructor. */
 410 fs_reg::fs_reg(uint32_t u)
 411 {
 412    init();
 413    this->file = IMM;
 414    this->type = BRW_REGISTER_TYPE_UD;
 415    this->imm.u = u;
 416 }
 417
 418 /** Fixed brw_reg Immediate value constructor. */
 419 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 420 {
 421    init();
 422    this->file = HW_REG;
 423    this->fixed_hw_reg = fixed_hw_reg;
 424    this->type = fixed_hw_reg.type;
 425 }
 426
 427 bool
 428 fs_reg::equals(const fs_reg &r) const
 429 {
 430    return (file == r.file &&
 431            reg == r.reg &&
 432            reg_offset == r.reg_offset &&
 433            type == r.type &&
 434            negate == r.negate &&
 435            abs == r.abs &&
 436            !reladdr && !r.reladdr &&
 437            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 438                   sizeof(fixed_hw_reg)) == 0 &&
 439            smear == r.smear &&
 440            imm.u == r.imm.u);
 441 }
 442
 443 fs_reg
 444 fs_reg::retype(uint32_t type)
 445 {
 446    fs_reg result = *this;
 447    result.type = type;
 448    return result;
 449 }
 450
 451 bool
 452 fs_reg::is_zero() const
 453 {
 454    if (file != IMM)
 455       return false;
 456
 457    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 458 }
 459
 460 bool
 461 fs_reg::is_one() const
 462 {
 463    if (file != IMM)
 464       return false;
 465
 466    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 467 }
 468
 469 bool
 470 fs_reg::is_valid_3src() const
 471 {
 472    return file == GRF || file == UNIFORM;
 473 }
 474
 475 int
 476 fs_visitor::type_size(const struct glsl_type *type)
 477 {
 478    unsigned int size, i;
 479
 480    switch (type->base_type) {
 481    case GLSL_TYPE_UINT:
 482    case GLSL_TYPE_INT:
 483    case GLSL_TYPE_FLOAT:
 484    case GLSL_TYPE_BOOL:
 485       return type->components();
 486    case GLSL_TYPE_ARRAY:
 487       return type_size(type->fields.array) * type->length;
 488    case GLSL_TYPE_STRUCT:
 489       size = 0;
 490       for (i = 0; i < type->length; i++) {
 491          size += type_size(type->fields.structure[i].type);
 492       }
 493       return size;
 494    case GLSL_TYPE_SAMPLER:
 495       /* Samplers take up no register space, since they're baked in at
 496        * link time.
 497        */
 498       return 0;
 499    case GLSL_TYPE_VOID:
 500    case GLSL_TYPE_ERROR:
 501    case GLSL_TYPE_INTERFACE:
 502       assert(!"not reached");
 503       break;
 504    }
 505
 506    return 0;
 507 }
 508
 509 fs_reg
 510 fs_visitor::get_timestamp()
 511 {
 512    assert(brw->gen >= 7);
 513
 514    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 515                                           BRW_ARF_TIMESTAMP,
 516                                           0),
 517                              BRW_REGISTER_TYPE_UD));
 518
 519    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 520
 521    fs_inst *mov = emit(MOV(dst, ts));
 522    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 523     * even if it's not enabled in the dispatch.
 524     */
 525    mov->force_writemask_all = true;
 526    mov->force_uncompressed = true;
 527
 528    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 529     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 530     * which is plenty of time for our purposes.  It is identical across the
 531     * EUs, but since it's tracking GPU core speed it will increment at a
 532     * varying rate as render P-states change.
 533     *
 534     * The caller could also check if render P-states have changed (or anything
 535     * else that might disrupt timing) by setting smear to 2 and checking if
 536     * that field is != 0.
 537     */
 538    dst.smear = 0;
 539
 540    return dst;
 541 }
 542
 543 void
 544 fs_visitor::emit_shader_time_begin()
 545 {
 546    current_annotation = "shader time start";
 547    shader_start_time = get_timestamp();
 548 }
 549
 550 void
 551 fs_visitor::emit_shader_time_end()
 552 {
 553    current_annotation = "shader time end";
 554
 555    enum shader_time_shader_type type, written_type, reset_type;
 556    if (dispatch_width == 8) {
 557       type = ST_FS8;
 558       written_type = ST_FS8_WRITTEN;
 559       reset_type = ST_FS8_RESET;
 560    } else {
 561       assert(dispatch_width == 16);
 562       type = ST_FS16;
 563       written_type = ST_FS16_WRITTEN;
 564       reset_type = ST_FS16_RESET;
 565    }
 566
 567    fs_reg shader_end_time = get_timestamp();
 568
 569    /* Check that there weren't any timestamp reset events (assuming these
 570     * were the only two timestamp reads that happened).
 571     */
 572    fs_reg reset = shader_end_time;
 573    reset.smear = 2;
 574    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 575    test->conditional_mod = BRW_CONDITIONAL_Z;
 576    emit(IF(BRW_PREDICATE_NORMAL));
 577
 578    push_force_uncompressed();
 579    fs_reg start = shader_start_time;
 580    start.negate = true;
 581    fs_reg diff = fs_reg(this, glsl_type::uint_type);
 582    emit(ADD(diff, start, shader_end_time));
 583
 584    /* If there were no instructions between the two timestamp gets, the diff
 585     * is 2 cycles.  Remove that overhead, so I can forget about that when
 586     * trying to determine the time taken for single instructions.
 587     */
 588    emit(ADD(diff, diff, fs_reg(-2u)));
 589
 590    emit_shader_time_write(type, diff);
 591    emit_shader_time_write(written_type, fs_reg(1u));
 592    emit(BRW_OPCODE_ELSE);
 593    emit_shader_time_write(reset_type, fs_reg(1u));
 594    emit(BRW_OPCODE_ENDIF);
 595
 596    pop_force_uncompressed();
 597 }
 598
 599 void
 600 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 601                                    fs_reg value)
 602 {
 603    int shader_time_index =
 604       brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
 605    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 606
 607    fs_reg payload;
 608    if (dispatch_width == 8)
 609       payload = fs_reg(this, glsl_type::uvec2_type);
 610    else
 611       payload = fs_reg(this, glsl_type::uint_type);
 612
 613    emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 614                 fs_reg(), payload, offset, value));
 615 }
 616
 617 void
 618 fs_visitor::fail(const char *format, ...)
 619 {
 620    va_list va;
 621    char *msg;
 622
 623    if (failed)
 624       return;
 625
 626    failed = true;
 627
 628    va_start(va, format);
 629    msg = ralloc_vasprintf(mem_ctx, format, va);
 630    va_end(va);
 631    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 632
 633    this->fail_msg = msg;
 634
 635    if (INTEL_DEBUG & DEBUG_WM) {
 636       fprintf(stderr, "%s",  msg);
 637    }
 638 }
 639
 640 fs_inst *
 641 fs_visitor::emit(enum opcode opcode)
 642 {
 643    return emit(fs_inst(opcode));
 644 }
 645
 646 fs_inst *
 647 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 648 {
 649    return emit(fs_inst(opcode, dst));
 650 }
 651
 652 fs_inst *
 653 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 654 {
 655    return emit(fs_inst(opcode, dst, src0));
 656 }
 657
 658 fs_inst *
 659 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 660 {
 661    return emit(fs_inst(opcode, dst, src0, src1));
 662 }
 663
 664 fs_inst *
 665 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 666                  fs_reg src0, fs_reg src1, fs_reg src2)
 667 {
 668    return emit(fs_inst(opcode, dst, src0, src1, src2));
 669 }
 670
 671 void
 672 fs_visitor::push_force_uncompressed()
 673 {
 674    force_uncompressed_stack++;
 675 }
 676
 677 void
 678 fs_visitor::pop_force_uncompressed()
 679 {
 680    force_uncompressed_stack--;
 681    assert(force_uncompressed_stack >= 0);
 682 }
 683
 684 void
 685 fs_visitor::push_force_sechalf()
 686 {
 687    force_sechalf_stack++;
 688 }
 689
 690 void
 691 fs_visitor::pop_force_sechalf()
 692 {
 693    force_sechalf_stack--;
 694    assert(force_sechalf_stack >= 0);
 695 }
 696
 697 /**
 698  * Returns true if the instruction has a flag that means it won't
 699  * update an entire destination register.
 700  *
 701  * For example, dead code elimination and live variable analysis want to know
 702  * when a write to a variable screens off any preceding values that were in
 703  * it.
 704  */
 705 bool
 706 fs_inst::is_partial_write()
 707 {
 708    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 709            this->force_uncompressed ||
 710            this->force_sechalf);
 711 }
 712
 713 int
 714 fs_inst::regs_read(fs_visitor *v, int arg)
 715 {
 716    if (is_tex() && arg == 0 && src[0].file == GRF) {
 717       if (v->dispatch_width == 16)
 718          return (mlen + 1) / 2;
 719       else
 720          return mlen;
 721    }
 722    return 1;
 723 }
 724
 725 /**
 726  * Returns how many MRFs an FS opcode will write over.
 727  *
 728  * Note that this is not the 0 or 1 implied writes in an actual gen
 729  * instruction -- the FS opcodes often generate MOVs in addition.
 730  */
 731 int
 732 fs_visitor::implied_mrf_writes(fs_inst *inst)
 733 {
 734    if (inst->mlen == 0)
 735       return 0;
 736
 737    if (inst->base_mrf == -1)
 738       return 0;
 739
 740    switch (inst->opcode) {
 741    case SHADER_OPCODE_RCP:
 742    case SHADER_OPCODE_RSQ:
 743    case SHADER_OPCODE_SQRT:
 744    case SHADER_OPCODE_EXP2:
 745    case SHADER_OPCODE_LOG2:
 746    case SHADER_OPCODE_SIN:
 747    case SHADER_OPCODE_COS:
 748       return 1 * dispatch_width / 8;
 749    case SHADER_OPCODE_POW:
 750    case SHADER_OPCODE_INT_QUOTIENT:
 751    case SHADER_OPCODE_INT_REMAINDER:
 752       return 2 * dispatch_width / 8;
 753    case SHADER_OPCODE_TEX:
 754    case FS_OPCODE_TXB:
 755    case SHADER_OPCODE_TXD:
 756    case SHADER_OPCODE_TXF:
 757    case SHADER_OPCODE_TXF_MS:
 758    case SHADER_OPCODE_TG4:
 759    case SHADER_OPCODE_TG4_OFFSET:
 760    case SHADER_OPCODE_TXL:
 761    case SHADER_OPCODE_TXS:
 762    case SHADER_OPCODE_LOD:
 763       return 1;
 764    case FS_OPCODE_FB_WRITE:
 765       return 2;
 766    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 767    case FS_OPCODE_UNSPILL:
 768       return 1;
 769    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 770       return inst->mlen;
 771    case FS_OPCODE_SPILL:
 772       return 2;
 773    default:
 774       assert(!"not reached");
 775       return inst->mlen;
 776    }
 777 }
 778
 779 int
 780 fs_visitor::virtual_grf_alloc(int size)
 781 {
 782    if (virtual_grf_array_size <= virtual_grf_count) {
 783       if (virtual_grf_array_size == 0)
 784          virtual_grf_array_size = 16;
 785       else
 786          virtual_grf_array_size *= 2;
 787       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 788                                    virtual_grf_array_size);
 789    }
 790    virtual_grf_sizes[virtual_grf_count] = size;
 791    return virtual_grf_count++;
 792 }
 793
 794 /** Fixed HW reg constructor. */
 795 fs_reg::fs_reg(enum register_file file, int reg)
 796 {
 797    init();
 798    this->file = file;
 799    this->reg = reg;
 800    this->type = BRW_REGISTER_TYPE_F;
 801 }
 802
 803 /** Fixed HW reg constructor. */
 804 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 805 {
 806    init();
 807    this->file = file;
 808    this->reg = reg;
 809    this->type = type;
 810 }
 811
 812 /** Automatic reg constructor. */
 813 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 814 {
 815    init();
 816
 817    this->file = GRF;
 818    this->reg = v->virtual_grf_alloc(v->type_size(type));
 819    this->reg_offset = 0;
 820    this->type = brw_type_for_base_type(type);
 821 }
 822
 823 fs_reg *
 824 fs_visitor::variable_storage(ir_variable *var)
 825 {
 826    return (fs_reg *)hash_table_find(this->variable_ht, var);
 827 }
 828
 829 void
 830 import_uniforms_callback(const void *key,
 831                          void *data,
 832                          void *closure)
 833 {
 834    struct hash_table *dst_ht = (struct hash_table *)closure;
 835    const fs_reg *reg = (const fs_reg *)data;
 836
 837    if (reg->file != UNIFORM)
 838       return;
 839
 840    hash_table_insert(dst_ht, data, key);
 841 }
 842
 843 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 844  * This brings in those uniform definitions
 845  */
 846 void
 847 fs_visitor::import_uniforms(fs_visitor *v)
 848 {
 849    hash_table_call_foreach(v->variable_ht,
 850                            import_uniforms_callback,
 851                            variable_ht);
 852    this->params_remap = v->params_remap;
 853    this->nr_params_remap = v->nr_params_remap;
 854 }
 855
 856 /* Our support for uniforms is piggy-backed on the struct
 857  * gl_fragment_program, because that's where the values actually
 858  * get stored, rather than in some global gl_shader_program uniform
 859  * store.
 860  */
 861 void
 862 fs_visitor::setup_uniform_values(ir_variable *ir)
 863 {
 864    int namelen = strlen(ir->name);
 865
 866    /* The data for our (non-builtin) uniforms is stored in a series of
 867     * gl_uniform_driver_storage structs for each subcomponent that
 868     * glGetUniformLocation() could name.  We know it's been set up in the same
 869     * order we'd walk the type, so walk the list of storage and find anything
 870     * with our name, or the prefix of a component that starts with our name.
 871     */
 872    unsigned params_before = c->prog_data.nr_params;
 873    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 874       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 875
 876       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 877           (storage->name[namelen] != 0 &&
 878            storage->name[namelen] != '.' &&
 879            storage->name[namelen] != '[')) {
 880          continue;
 881       }
 882
 883       unsigned slots = storage->type->component_slots();
 884       if (storage->array_elements)
 885          slots *= storage->array_elements;
 886
 887       for (unsigned i = 0; i < slots; i++) {
 888          c->prog_data.param[c->prog_data.nr_params++] =
 889             &storage->storage[i].f;
 890       }
 891    }
 892
 893    /* Make sure we actually initialized the right amount of stuff here. */
 894    assert(params_before + ir->type->component_slots() ==
 895           c->prog_data.nr_params);
 896    (void)params_before;
 897 }
 898
 899
 900 /* Our support for builtin uniforms is even scarier than non-builtin.
 901  * It sits on top of the PROG_STATE_VAR parameters that are
 902  * automatically updated from GL context state.
 903  */
 904 void
 905 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 906 {
 907    const ir_state_slot *const slots = ir->state_slots;
 908    assert(ir->state_slots != NULL);
 909
 910    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 911       /* This state reference has already been setup by ir_to_mesa, but we'll
 912        * get the same index back here.
 913        */
 914       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 915                                             (gl_state_index *)slots[i].tokens);
 916
 917       /* Add each of the unique swizzles of the element as a parameter.
 918        * This'll end up matching the expected layout of the
 919        * array/matrix/structure we're trying to fill in.
 920        */
 921       int last_swiz = -1;
 922       for (unsigned int j = 0; j < 4; j++) {
 923          int swiz = GET_SWZ(slots[i].swizzle, j);
 924          if (swiz == last_swiz)
 925             break;
 926          last_swiz = swiz;
 927
 928          c->prog_data.param[c->prog_data.nr_params++] =
 929             &fp->Base.Parameters->ParameterValues[index][swiz].f;
 930       }
 931    }
 932 }
 933
 934 fs_reg *
 935 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 936 {
 937    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 938    fs_reg wpos = *reg;
 939    bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
 940
 941    /* gl_FragCoord.x */
 942    if (ir->pixel_center_integer) {
 943       emit(MOV(wpos, this->pixel_x));
 944    } else {
 945       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
 946    }
 947    wpos.reg_offset++;
 948
 949    /* gl_FragCoord.y */
 950    if (!flip && ir->pixel_center_integer) {
 951       emit(MOV(wpos, this->pixel_y));
 952    } else {
 953       fs_reg pixel_y = this->pixel_y;
 954       float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
 955
 956       if (flip) {
 957          pixel_y.negate = true;
 958          offset += c->key.drawable_height - 1.0;
 959       }
 960
 961       emit(ADD(wpos, pixel_y, fs_reg(offset)));
 962    }
 963    wpos.reg_offset++;
 964
 965    /* gl_FragCoord.z */
 966    if (brw->gen >= 6) {
 967       emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
 968    } else {
 969       emit(FS_OPCODE_LINTERP, wpos,
 970            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 971            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 972            interp_reg(VARYING_SLOT_POS, 2));
 973    }
 974    wpos.reg_offset++;
 975
 976    /* gl_FragCoord.w: Already set up in emit_interpolation */
 977    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
 978
 979    return reg;
 980 }
 981
 982 fs_inst *
 983 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 984                          glsl_interp_qualifier interpolation_mode,
 985                          bool is_centroid)
 986 {
 987    brw_wm_barycentric_interp_mode barycoord_mode;
 988    if (brw->gen >= 6) {
 989       if (is_centroid) {
 990          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 991             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
 992          else
 993             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
 994       } else {
 995          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 996             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 997          else
 998             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
 999       }
1000    } else {
1001       /* On Ironlake and below, there is only one interpolation mode.
1002        * Centroid interpolation doesn't mean anything on this hardware --
1003        * there is no multisampling.
1004        */
1005       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1006    }
1007    return emit(FS_OPCODE_LINTERP, attr,
1008                this->delta_x[barycoord_mode],
1009                this->delta_y[barycoord_mode], interp);
1010 }
1011
1012 fs_reg *
1013 fs_visitor::emit_general_interpolation(ir_variable *ir)
1014 {
1015    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1016    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1017    fs_reg attr = *reg;
1018
1019    unsigned int array_elements;
1020    const glsl_type *type;
1021
1022    if (ir->type->is_array()) {
1023       array_elements = ir->type->length;
1024       if (array_elements == 0) {
1025          fail("dereferenced array '%s' has length 0\n", ir->name);
1026       }
1027       type = ir->type->fields.array;
1028    } else {
1029       array_elements = 1;
1030       type = ir->type;
1031    }
1032
1033    glsl_interp_qualifier interpolation_mode =
1034       ir->determine_interpolation_mode(c->key.flat_shade);
1035
1036    int location = ir->location;
1037    for (unsigned int i = 0; i < array_elements; i++) {
1038       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1039          if (c->prog_data.urb_setup[location] == -1) {
1040             /* If there's no incoming setup data for this slot, don't
1041              * emit interpolation for it.
1042              */
1043             attr.reg_offset += type->vector_elements;
1044             location++;
1045             continue;
1046          }
1047
1048          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1049             /* Constant interpolation (flat shading) case. The SF has
1050              * handed us defined values in only the constant offset
1051              * field of the setup reg.
1052              */
1053             for (unsigned int k = 0; k < type->vector_elements; k++) {
1054                struct brw_reg interp = interp_reg(location, k);
1055                interp = suboffset(interp, 3);
1056                interp.type = reg->type;
1057                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1058                attr.reg_offset++;
1059             }
1060          } else {
1061             /* Smooth/noperspective interpolation case. */
1062             for (unsigned int k = 0; k < type->vector_elements; k++) {
1063                /* FINISHME: At some point we probably want to push
1064                 * this farther by giving similar treatment to the
1065                 * other potentially constant components of the
1066                 * attribute, as well as making brw_vs_constval.c
1067                 * handle varyings other than gl_TexCoord.
1068                 */
1069                struct brw_reg interp = interp_reg(location, k);
1070                emit_linterp(attr, fs_reg(interp), interpolation_mode,
1071                             ir->centroid);
1072                if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1073                   /* Get the pixel/sample mask into f0 so that we know
1074                    * which pixels are lit.  Then, for each channel that is
1075                    * unlit, replace the centroid data with non-centroid
1076                    * data.
1077                    */
1078                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1079                   fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1080                                                interpolation_mode, false);
1081                   inst->predicate = BRW_PREDICATE_NORMAL;
1082                   inst->predicate_inverse = true;
1083                }
1084                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1085                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1086                }
1087                attr.reg_offset++;
1088             }
1089
1090          }
1091          location++;
1092       }
1093    }
1094
1095    return reg;
1096 }
1097
1098 fs_reg *
1099 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1100 {
1101    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1102
1103    /* The frontfacing comes in as a bit in the thread payload. */
1104    if (brw->gen >= 6) {
1105       emit(BRW_OPCODE_ASR, *reg,
1106            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1107            fs_reg(15));
1108       emit(BRW_OPCODE_NOT, *reg, *reg);
1109       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1110    } else {
1111       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1112       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1113        * us front face
1114        */
1115       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1116       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1117    }
1118
1119    return reg;
1120 }
1121
1122 fs_reg
1123 fs_visitor::fix_math_operand(fs_reg src)
1124 {
1125    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1126     * might be able to do better by doing execsize = 1 math and then
1127     * expanding that result out, but we would need to be careful with
1128     * masking.
1129     *
1130     * The hardware ignores source modifiers (negate and abs) on math
1131     * instructions, so we also move to a temp to set those up.
1132     */
1133    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1134        !src.abs && !src.negate)
1135       return src;
1136
1137    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1138     * operands to math
1139     */
1140    if (brw->gen >= 7 && src.file != IMM)
1141       return src;
1142
1143    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1144    expanded.type = src.type;
1145    emit(BRW_OPCODE_MOV, expanded, src);
1146    return expanded;
1147 }
1148
1149 fs_inst *
1150 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1151 {
1152    switch (opcode) {
1153    case SHADER_OPCODE_RCP:
1154    case SHADER_OPCODE_RSQ:
1155    case SHADER_OPCODE_SQRT:
1156    case SHADER_OPCODE_EXP2:
1157    case SHADER_OPCODE_LOG2:
1158    case SHADER_OPCODE_SIN:
1159    case SHADER_OPCODE_COS:
1160       break;
1161    default:
1162       assert(!"not reached: bad math opcode");
1163       return NULL;
1164    }
1165
1166    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1167     * might be able to do better by doing execsize = 1 math and then
1168     * expanding that result out, but we would need to be careful with
1169     * masking.
1170     *
1171     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1172     * instructions, so we also move to a temp to set those up.
1173     */
1174    if (brw->gen >= 6)
1175       src = fix_math_operand(src);
1176
1177    fs_inst *inst = emit(opcode, dst, src);
1178
1179    if (brw->gen < 6) {
1180       inst->base_mrf = 2;
1181       inst->mlen = dispatch_width / 8;
1182    }
1183
1184    return inst;
1185 }
1186
1187 fs_inst *
1188 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1189 {
1190    int base_mrf = 2;
1191    fs_inst *inst;
1192
1193    switch (opcode) {
1194    case SHADER_OPCODE_INT_QUOTIENT:
1195    case SHADER_OPCODE_INT_REMAINDER:
1196       if (brw->gen >= 7 && dispatch_width == 16)
1197          fail("16-wide INTDIV unsupported\n");
1198       break;
1199    case SHADER_OPCODE_POW:
1200       break;
1201    default:
1202       assert(!"not reached: unsupported binary math opcode.");
1203       return NULL;
1204    }
1205
1206    if (brw->gen >= 6) {
1207       src0 = fix_math_operand(src0);
1208       src1 = fix_math_operand(src1);
1209
1210       inst = emit(opcode, dst, src0, src1);
1211    } else {
1212       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1213        * "Message Payload":
1214        *
1215        * "Operand0[7].  For the INT DIV functions, this operand is the
1216        *  denominator."
1217        *  ...
1218        * "Operand1[7].  For the INT DIV functions, this operand is the
1219        *  numerator."
1220        */
1221       bool is_int_div = opcode != SHADER_OPCODE_POW;
1222       fs_reg &op0 = is_int_div ? src1 : src0;
1223       fs_reg &op1 = is_int_div ? src0 : src1;
1224
1225       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1226       inst = emit(opcode, dst, op0, reg_null_f);
1227
1228       inst->base_mrf = base_mrf;
1229       inst->mlen = 2 * dispatch_width / 8;
1230    }
1231    return inst;
1232 }
1233
1234 void
1235 fs_visitor::assign_curb_setup()
1236 {
1237    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1238    if (dispatch_width == 8) {
1239       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1240    } else {
1241       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1242    }
1243
1244    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1245    foreach_list(node, &this->instructions) {
1246       fs_inst *inst = (fs_inst *)node;
1247
1248       for (unsigned int i = 0; i < 3; i++) {
1249          if (inst->src[i].file == UNIFORM) {
1250             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1251             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1252                                                   constant_nr / 8,
1253                                                   constant_nr % 8);
1254
1255             inst->src[i].file = HW_REG;
1256             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1257          }
1258       }
1259    }
1260 }
1261
1262 void
1263 fs_visitor::calculate_urb_setup()
1264 {
1265    for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1266       c->prog_data.urb_setup[i] = -1;
1267    }
1268
1269    int urb_next = 0;
1270    /* Figure out where each of the incoming setup attributes lands. */
1271    if (brw->gen >= 6) {
1272       if (_mesa_bitcount_64(fp->Base.InputsRead &
1273                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1274          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1275           * first 16 varying inputs, so we can put them wherever we want.
1276           * Just put them in order.
1277           *
1278           * This is useful because it means that (a) inputs not used by the
1279           * fragment shader won't take up valuable register space, and (b) we
1280           * won't have to recompile the fragment shader if it gets paired with
1281           * a different vertex (or geometry) shader.
1282           */
1283          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1284             if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1285                 BITFIELD64_BIT(i)) {
1286                c->prog_data.urb_setup[i] = urb_next++;
1287             }
1288          }
1289       } else {
1290          /* We have enough input varyings that the SF/SBE pipeline stage can't
1291           * arbitrarily rearrange them to suit our whim; we have to put them
1292           * in an order that matches the output of the previous pipeline stage
1293           * (geometry or vertex shader).
1294           */
1295          struct brw_vue_map prev_stage_vue_map;
1296          brw_compute_vue_map(brw, &prev_stage_vue_map,
1297                              c->key.input_slots_valid);
1298          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1299          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1300          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1301               slot++) {
1302             int varying = prev_stage_vue_map.slot_to_varying[slot];
1303             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1304              * unused.
1305              */
1306             if (varying != BRW_VARYING_SLOT_COUNT &&
1307                 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1308                  BITFIELD64_BIT(varying))) {
1309                c->prog_data.urb_setup[varying] = slot - first_slot;
1310             }
1311          }
1312          urb_next = prev_stage_vue_map.num_slots - first_slot;
1313       }
1314    } else {
1315       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1316       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1317          /* Point size is packed into the header, not as a general attribute */
1318          if (i == VARYING_SLOT_PSIZ)
1319             continue;
1320
1321          if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1322             /* The back color slot is skipped when the front color is
1323              * also written to.  In addition, some slots can be
1324              * written in the vertex shader and not read in the
1325              * fragment shader.  So the register number must always be
1326              * incremented, mapped or not.
1327              */
1328             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1329                c->prog_data.urb_setup[i] = urb_next;
1330             urb_next++;
1331          }
1332       }
1333
1334       /*
1335        * It's a FS only attribute, and we did interpolation for this attribute
1336        * in SF thread. So, count it here, too.
1337        *
1338        * See compile_sf_prog() for more info.
1339        */
1340       if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1341          c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1342    }
1343
1344    c->prog_data.num_varying_inputs = urb_next;
1345 }
1346
1347 void
1348 fs_visitor::assign_urb_setup()
1349 {
1350    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1351
1352    /* Offset all the urb_setup[] index by the actual position of the
1353     * setup regs, now that the location of the constants has been chosen.
1354     */
1355    foreach_list(node, &this->instructions) {
1356       fs_inst *inst = (fs_inst *)node;
1357
1358       if (inst->opcode == FS_OPCODE_LINTERP) {
1359          assert(inst->src[2].file == HW_REG);
1360          inst->src[2].fixed_hw_reg.nr += urb_start;
1361       }
1362
1363       if (inst->opcode == FS_OPCODE_CINTERP) {
1364          assert(inst->src[0].file == HW_REG);
1365          inst->src[0].fixed_hw_reg.nr += urb_start;
1366       }
1367    }
1368
1369    /* Each attribute is 4 setup channels, each of which is half a reg. */
1370    this->first_non_payload_grf =
1371       urb_start + c->prog_data.num_varying_inputs * 2;
1372 }
1373
1374 /**
1375  * Split large virtual GRFs into separate components if we can.
1376  *
1377  * This is mostly duplicated with what brw_fs_vector_splitting does,
1378  * but that's really conservative because it's afraid of doing
1379  * splitting that doesn't result in real progress after the rest of
1380  * the optimization phases, which would cause infinite looping in
1381  * optimization.  We can do it once here, safely.  This also has the
1382  * opportunity to split interpolated values, or maybe even uniforms,
1383  * which we don't have at the IR level.
1384  *
1385  * We want to split, because virtual GRFs are what we register
1386  * allocate and spill (due to contiguousness requirements for some
1387  * instructions), and they're what we naturally generate in the
1388  * codegen process, but most virtual GRFs don't actually need to be
1389  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1390  * live intervals and better dead code elimination and coalescing.
1391  */
1392 void
1393 fs_visitor::split_virtual_grfs()
1394 {
1395    int num_vars = this->virtual_grf_count;
1396    bool split_grf[num_vars];
1397    int new_virtual_grf[num_vars];
1398
1399    /* Try to split anything > 0 sized. */
1400    for (int i = 0; i < num_vars; i++) {
1401       if (this->virtual_grf_sizes[i] != 1)
1402          split_grf[i] = true;
1403       else
1404          split_grf[i] = false;
1405    }
1406
1407    if (brw->has_pln &&
1408        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1409       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1410        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1411        * Gen6, that was the only supported interpolation mode, and since Gen6,
1412        * delta_x and delta_y are in fixed hardware registers.
1413        */
1414       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1415          false;
1416    }
1417
1418    foreach_list(node, &this->instructions) {
1419       fs_inst *inst = (fs_inst *)node;
1420
1421       /* If there's a SEND message that requires contiguous destination
1422        * registers, no splitting is allowed.
1423        */
1424       if (inst->regs_written > 1) {
1425          split_grf[inst->dst.reg] = false;
1426       }
1427
1428       /* If we're sending from a GRF, don't split it, on the assumption that
1429        * the send is reading the whole thing.
1430        */
1431       if (inst->is_send_from_grf()) {
1432          for (int i = 0; i < 3; i++) {
1433             if (inst->src[i].file == GRF) {
1434                split_grf[inst->src[i].reg] = false;
1435             }
1436          }
1437       }
1438    }
1439
1440    /* Allocate new space for split regs.  Note that the virtual
1441     * numbers will be contiguous.
1442     */
1443    for (int i = 0; i < num_vars; i++) {
1444       if (split_grf[i]) {
1445          new_virtual_grf[i] = virtual_grf_alloc(1);
1446          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1447             int reg = virtual_grf_alloc(1);
1448             assert(reg == new_virtual_grf[i] + j - 1);
1449             (void) reg;
1450          }
1451          this->virtual_grf_sizes[i] = 1;
1452       }
1453    }
1454
1455    foreach_list(node, &this->instructions) {
1456       fs_inst *inst = (fs_inst *)node;
1457
1458       if (inst->dst.file == GRF &&
1459           split_grf[inst->dst.reg] &&
1460           inst->dst.reg_offset != 0) {
1461          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1462                           inst->dst.reg_offset - 1);
1463          inst->dst.reg_offset = 0;
1464       }
1465       for (int i = 0; i < 3; i++) {
1466          if (inst->src[i].file == GRF &&
1467              split_grf[inst->src[i].reg] &&
1468              inst->src[i].reg_offset != 0) {
1469             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1470                                 inst->src[i].reg_offset - 1);
1471             inst->src[i].reg_offset = 0;
1472          }
1473       }
1474    }
1475    invalidate_live_intervals();
1476 }
1477
1478 /**
1479  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1480  *
1481  * During code generation, we create tons of temporary variables, many of
1482  * which get immediately killed and are never used again.  Yet, in later
1483  * optimization and analysis passes, such as compute_live_intervals, we need
1484  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1485  * overhead.
1486  */
1487 void
1488 fs_visitor::compact_virtual_grfs()
1489 {
1490    /* Mark which virtual GRFs are used, and count how many. */
1491    int remap_table[this->virtual_grf_count];
1492    memset(remap_table, -1, sizeof(remap_table));
1493
1494    foreach_list(node, &this->instructions) {
1495       const fs_inst *inst = (const fs_inst *) node;
1496
1497       if (inst->dst.file == GRF)
1498          remap_table[inst->dst.reg] = 0;
1499
1500       for (int i = 0; i < 3; i++) {
1501          if (inst->src[i].file == GRF)
1502             remap_table[inst->src[i].reg] = 0;
1503       }
1504    }
1505
1506    /* In addition to registers used in instructions, fs_visitor keeps
1507     * direct references to certain special values which must be patched:
1508     */
1509    fs_reg *special[] = {
1510       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1511       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1512       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1513       &delta_x[0], &delta_x[1], &delta_x[2],
1514       &delta_x[3], &delta_x[4], &delta_x[5],
1515       &delta_y[0], &delta_y[1], &delta_y[2],
1516       &delta_y[3], &delta_y[4], &delta_y[5],
1517    };
1518    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1519    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1520
1521    /* Treat all special values as used, to be conservative */
1522    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1523       if (special[i]->file == GRF)
1524          remap_table[special[i]->reg] = 0;
1525    }
1526
1527    /* Compact the GRF arrays. */
1528    int new_index = 0;
1529    for (int i = 0; i < this->virtual_grf_count; i++) {
1530       if (remap_table[i] != -1) {
1531          remap_table[i] = new_index;
1532          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1533          invalidate_live_intervals();
1534          ++new_index;
1535       }
1536    }
1537
1538    this->virtual_grf_count = new_index;
1539
1540    /* Patch all the instructions to use the newly renumbered registers */
1541    foreach_list(node, &this->instructions) {
1542       fs_inst *inst = (fs_inst *) node;
1543
1544       if (inst->dst.file == GRF)
1545          inst->dst.reg = remap_table[inst->dst.reg];
1546
1547       for (int i = 0; i < 3; i++) {
1548          if (inst->src[i].file == GRF)
1549             inst->src[i].reg = remap_table[inst->src[i].reg];
1550       }
1551    }
1552
1553    /* Patch all the references to special values */
1554    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1555       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1556          special[i]->reg = remap_table[special[i]->reg];
1557    }
1558 }
1559
1560 bool
1561 fs_visitor::remove_dead_constants()
1562 {
1563    if (dispatch_width == 8) {
1564       this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1565       this->nr_params_remap = c->prog_data.nr_params;
1566
1567       for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1568          this->params_remap[i] = -1;
1569
1570       /* Find which params are still in use. */
1571       foreach_list(node, &this->instructions) {
1572          fs_inst *inst = (fs_inst *)node;
1573
1574          for (int i = 0; i < 3; i++) {
1575             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1576
1577             if (inst->src[i].file != UNIFORM)
1578                continue;
1579
1580             /* Section 5.11 of the OpenGL 4.3 spec says:
1581              *
1582              *     "Out-of-bounds reads return undefined values, which include
1583              *     values from other variables of the active program or zero."
1584              */
1585             if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1586                constant_nr = 0;
1587             }
1588
1589             /* For now, set this to non-negative.  We'll give it the
1590              * actual new number in a moment, in order to keep the
1591              * register numbers nicely ordered.
1592              */
1593             this->params_remap[constant_nr] = 0;
1594          }
1595       }
1596
1597       /* Figure out what the new numbers for the params will be.  At some
1598        * point when we're doing uniform array access, we're going to want
1599        * to keep the distinction between .reg and .reg_offset, but for
1600        * now we don't care.
1601        */
1602       unsigned int new_nr_params = 0;
1603       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1604          if (this->params_remap[i] != -1) {
1605             this->params_remap[i] = new_nr_params++;
1606          }
1607       }
1608
1609       /* Update the list of params to be uploaded to match our new numbering. */
1610       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1611          int remapped = this->params_remap[i];
1612
1613          if (remapped == -1)
1614             continue;
1615
1616          c->prog_data.param[remapped] = c->prog_data.param[i];
1617       }
1618
1619       c->prog_data.nr_params = new_nr_params;
1620    } else {
1621       /* This should have been generated in the 8-wide pass already. */
1622       assert(this->params_remap);
1623    }
1624
1625    /* Now do the renumbering of the shader to remove unused params. */
1626    foreach_list(node, &this->instructions) {
1627       fs_inst *inst = (fs_inst *)node;
1628
1629       for (int i = 0; i < 3; i++) {
1630          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1631
1632          if (inst->src[i].file != UNIFORM)
1633             continue;
1634
1635          /* as above alias to 0 */
1636          if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1637             constant_nr = 0;
1638          }
1639          assert(this->params_remap[constant_nr] != -1);
1640          inst->src[i].reg = this->params_remap[constant_nr];
1641          inst->src[i].reg_offset = 0;
1642       }
1643    }
1644
1645    return true;
1646 }
1647
1648 /*
1649  * Implements array access of uniforms by inserting a
1650  * PULL_CONSTANT_LOAD instruction.
1651  *
1652  * Unlike temporary GRF array access (where we don't support it due to
1653  * the difficulty of doing relative addressing on instruction
1654  * destinations), we could potentially do array access of uniforms
1655  * that were loaded in GRF space as push constants.  In real-world
1656  * usage we've seen, though, the arrays being used are always larger
1657  * than we could load as push constants, so just always move all
1658  * uniform array access out to a pull constant buffer.
1659  */
1660 void
1661 fs_visitor::move_uniform_array_access_to_pull_constants()
1662 {
1663    int pull_constant_loc[c->prog_data.nr_params];
1664
1665    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1666       pull_constant_loc[i] = -1;
1667    }
1668
1669    /* Walk through and find array access of uniforms.  Put a copy of that
1670     * uniform in the pull constant buffer.
1671     *
1672     * Note that we don't move constant-indexed accesses to arrays.  No
1673     * testing has been done of the performance impact of this choice.
1674     */
1675    foreach_list_safe(node, &this->instructions) {
1676       fs_inst *inst = (fs_inst *)node;
1677
1678       for (int i = 0 ; i < 3; i++) {
1679          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1680             continue;
1681
1682          int uniform = inst->src[i].reg;
1683
1684          /* If this array isn't already present in the pull constant buffer,
1685           * add it.
1686           */
1687          if (pull_constant_loc[uniform] == -1) {
1688             const float **values = &c->prog_data.param[uniform];
1689
1690             pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1691
1692             assert(param_size[uniform]);
1693
1694             for (int j = 0; j < param_size[uniform]; j++) {
1695                c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1696                   values[j];
1697             }
1698          }
1699
1700          /* Set up the annotation tracking for new generated instructions. */
1701          base_ir = inst->ir;
1702          current_annotation = inst->annotation;
1703
1704          fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1705          fs_reg temp = fs_reg(this, glsl_type::float_type);
1706          exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1707                                                      surf_index,
1708                                                      *inst->src[i].reladdr,
1709                                                      pull_constant_loc[uniform] +
1710                                                      inst->src[i].reg_offset);
1711          inst->insert_before(&list);
1712
1713          inst->src[i].file = temp.file;
1714          inst->src[i].reg = temp.reg;
1715          inst->src[i].reg_offset = temp.reg_offset;
1716          inst->src[i].reladdr = NULL;
1717       }
1718    }
1719 }
1720
1721 /**
1722  * Choose accesses from the UNIFORM file to demote to using the pull
1723  * constant buffer.
1724  *
1725  * We allow a fragment shader to have more than the specified minimum
1726  * maximum number of fragment shader uniform components (64).  If
1727  * there are too many of these, they'd fill up all of register space.
1728  * So, this will push some of them out to the pull constant buffer and
1729  * update the program to load them.
1730  */
1731 void
1732 fs_visitor::setup_pull_constants()
1733 {
1734    /* Only allow 16 registers (128 uniform components) as push constants. */
1735    unsigned int max_uniform_components = 16 * 8;
1736    if (c->prog_data.nr_params <= max_uniform_components)
1737       return;
1738
1739    if (dispatch_width == 16) {
1740       fail("Pull constants not supported in 16-wide\n");
1741       return;
1742    }
1743
1744    /* Just demote the end of the list.  We could probably do better
1745     * here, demoting things that are rarely used in the program first.
1746     */
1747    unsigned int pull_uniform_base = max_uniform_components;
1748
1749    int pull_constant_loc[c->prog_data.nr_params];
1750    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1751       if (i < pull_uniform_base) {
1752          pull_constant_loc[i] = -1;
1753       } else {
1754          pull_constant_loc[i] = -1;
1755          /* If our constant is already being uploaded for reladdr purposes,
1756           * reuse it.
1757           */
1758          for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1759             if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1760                pull_constant_loc[i] = j;
1761                break;
1762             }
1763          }
1764          if (pull_constant_loc[i] == -1) {
1765             int pull_index = c->prog_data.nr_pull_params++;
1766             c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1767             pull_constant_loc[i] = pull_index;;
1768          }
1769       }
1770    }
1771    c->prog_data.nr_params = pull_uniform_base;
1772
1773    foreach_list(node, &this->instructions) {
1774       fs_inst *inst = (fs_inst *)node;
1775
1776       for (int i = 0; i < 3; i++) {
1777          if (inst->src[i].file != UNIFORM)
1778             continue;
1779
1780          int pull_index = pull_constant_loc[inst->src[i].reg +
1781                                             inst->src[i].reg_offset];
1782          if (pull_index == -1)
1783             continue;
1784
1785          assert(!inst->src[i].reladdr);
1786
1787          fs_reg dst = fs_reg(this, glsl_type::float_type);
1788          fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1789          fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1790          fs_inst *pull =
1791             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1792                                  dst, index, offset);
1793          pull->ir = inst->ir;
1794          pull->annotation = inst->annotation;
1795
1796          inst->insert_before(pull);
1797
1798          inst->src[i].file = GRF;
1799          inst->src[i].reg = dst.reg;
1800          inst->src[i].reg_offset = 0;
1801          inst->src[i].smear = pull_index & 3;
1802       }
1803    }
1804 }
1805
1806 bool
1807 fs_visitor::opt_algebraic()
1808 {
1809    bool progress = false;
1810
1811    foreach_list(node, &this->instructions) {
1812       fs_inst *inst = (fs_inst *)node;
1813
1814       switch (inst->opcode) {
1815       case BRW_OPCODE_MUL:
1816          if (inst->src[1].file != IMM)
1817             continue;
1818
1819          /* a * 1.0 = a */
1820          if (inst->src[1].is_one()) {
1821             inst->opcode = BRW_OPCODE_MOV;
1822             inst->src[1] = reg_undef;
1823             progress = true;
1824             break;
1825          }
1826
1827          /* a * 0.0 = 0.0 */
1828          if (inst->src[1].is_zero()) {
1829             inst->opcode = BRW_OPCODE_MOV;
1830             inst->src[0] = inst->src[1];
1831             inst->src[1] = reg_undef;
1832             progress = true;
1833             break;
1834          }
1835
1836          break;
1837       case BRW_OPCODE_ADD:
1838          if (inst->src[1].file != IMM)
1839             continue;
1840
1841          /* a + 0.0 = a */
1842          if (inst->src[1].is_zero()) {
1843             inst->opcode = BRW_OPCODE_MOV;
1844             inst->src[1] = reg_undef;
1845             progress = true;
1846             break;
1847          }
1848          break;
1849       default:
1850          break;
1851       }
1852    }
1853
1854    return progress;
1855 }
1856
1857 /**
1858  * Removes any instructions writing a VGRF where that VGRF is not used by any
1859  * later instruction.
1860  */
1861 bool
1862 fs_visitor::dead_code_eliminate()
1863 {
1864    bool progress = false;
1865    int pc = 0;
1866
1867    calculate_live_intervals();
1868
1869    foreach_list_safe(node, &this->instructions) {
1870       fs_inst *inst = (fs_inst *)node;
1871
1872       if (inst->dst.file == GRF) {
1873          bool dead = true;
1874
1875          for (int i = 0; i < inst->regs_written; i++) {
1876             int var = live_intervals->var_from_vgrf[inst->dst.reg];
1877             assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
1878             if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
1879                dead = false;
1880                break;
1881             }
1882          }
1883
1884          if (dead) {
1885             /* Don't dead code eliminate instructions that write to the
1886              * accumulator as a side-effect. Instead just set the destination
1887              * to the null register to free it.
1888              */
1889             switch (inst->opcode) {
1890             case BRW_OPCODE_ADDC:
1891             case BRW_OPCODE_SUBB:
1892             case BRW_OPCODE_MACH:
1893                inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
1894                break;
1895             default:
1896                inst->remove();
1897                progress = true;
1898                break;
1899             }
1900          }
1901       }
1902
1903       pc++;
1904    }
1905
1906    if (progress)
1907       invalidate_live_intervals();
1908
1909    return progress;
1910 }
1911
1912 struct dead_code_hash_key
1913 {
1914    int vgrf;
1915    int reg_offset;
1916 };
1917
1918 static bool
1919 dead_code_hash_compare(const void *a, const void *b)
1920 {
1921    return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1922 }
1923
1924 static void
1925 clear_dead_code_hash(struct hash_table *ht)
1926 {
1927    struct hash_entry *entry;
1928
1929    hash_table_foreach(ht, entry) {
1930       _mesa_hash_table_remove(ht, entry);
1931    }
1932 }
1933
1934 static void
1935 insert_dead_code_hash(struct hash_table *ht,
1936                       int vgrf, int reg_offset, fs_inst *inst)
1937 {
1938    /* We don't bother freeing keys, because they'll be GCed with the ht. */
1939    struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1940
1941    key->vgrf = vgrf;
1942    key->reg_offset = reg_offset;
1943
1944    _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1945 }
1946
1947 static struct hash_entry *
1948 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1949 {
1950    struct dead_code_hash_key key;
1951
1952    key.vgrf = vgrf;
1953    key.reg_offset = reg_offset;
1954
1955    return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1956 }
1957
1958 static void
1959 remove_dead_code_hash(struct hash_table *ht,
1960                       int vgrf, int reg_offset)
1961 {
1962    struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1963    if (!entry)
1964       return;
1965
1966    _mesa_hash_table_remove(ht, entry);
1967 }
1968
1969 /**
1970  * Walks basic blocks, removing any regs that are written but not read before
1971  * being redefined.
1972  *
1973  * The dead_code_eliminate() function implements a global dead code
1974  * elimination, but it only handles the removing the last write to a register
1975  * if it's never read.  This one can handle intermediate writes, but only
1976  * within a basic block.
1977  */
1978 bool
1979 fs_visitor::dead_code_eliminate_local()
1980 {
1981    struct hash_table *ht;
1982    bool progress = false;
1983
1984    ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1985
1986    foreach_list_safe(node, &this->instructions) {
1987       fs_inst *inst = (fs_inst *)node;
1988
1989       /* At a basic block, empty the HT since we don't understand dataflow
1990        * here.
1991        */
1992       if (inst->is_control_flow()) {
1993          clear_dead_code_hash(ht);
1994          continue;
1995       }
1996
1997       /* Clear the HT of any instructions that got read. */
1998       for (int i = 0; i < 3; i++) {
1999          fs_reg src = inst->src[i];
2000          if (src.file != GRF)
2001             continue;
2002
2003          int read = 1;
2004          if (inst->is_send_from_grf())
2005             read = virtual_grf_sizes[src.reg] - src.reg_offset;
2006
2007          for (int reg_offset = src.reg_offset;
2008               reg_offset < src.reg_offset + read;
2009               reg_offset++) {
2010             remove_dead_code_hash(ht, src.reg, reg_offset);
2011          }
2012       }
2013
2014       /* Add any update of a GRF to the HT, removing a previous write if it
2015        * wasn't read.
2016        */
2017       if (inst->dst.file == GRF) {
2018          if (inst->regs_written > 1) {
2019             /* We don't know how to trim channels from an instruction's
2020              * writes, so we can't incrementally remove unread channels from
2021              * it.  Just remove whatever it overwrites from the table
2022              */
2023             for (int i = 0; i < inst->regs_written; i++) {
2024                remove_dead_code_hash(ht,
2025                                      inst->dst.reg,
2026                                      inst->dst.reg_offset + i);
2027             }
2028          } else {
2029             struct hash_entry *entry =
2030                get_dead_code_hash_entry(ht, inst->dst.reg,
2031                                         inst->dst.reg_offset);
2032
2033             if (inst->is_partial_write()) {
2034                /* For a partial write, we can't remove any previous dead code
2035                 * candidate, since we're just modifying their result, but we can
2036                 * be dead code eliminiated ourselves.
2037                 */
2038                if (entry) {
2039                   entry->data = inst;
2040                } else {
2041                   insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2042                                         inst);
2043                }
2044             } else {
2045                if (entry) {
2046                   /* We're completely updating a channel, and there was a
2047                    * previous write to the channel that wasn't read.  Kill it!
2048                    */
2049                   fs_inst *inst = (fs_inst *)entry->data;
2050                   inst->remove();
2051                   progress = true;
2052                   _mesa_hash_table_remove(ht, entry);
2053                }
2054
2055                insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2056                                      inst);
2057             }
2058          }
2059       }
2060    }
2061
2062    _mesa_hash_table_destroy(ht, NULL);
2063
2064    if (progress)
2065       invalidate_live_intervals();
2066
2067    return progress;
2068 }
2069
2070 /**
2071  * Implements a second type of register coalescing: This one checks if
2072  * the two regs involved in a raw move don't interfere, in which case
2073  * they can both by stored in the same place and the MOV removed.
2074  */
2075 bool
2076 fs_visitor::register_coalesce_2()
2077 {
2078    bool progress = false;
2079
2080    calculate_live_intervals();
2081
2082    foreach_list_safe(node, &this->instructions) {
2083       fs_inst *inst = (fs_inst *)node;
2084
2085       if (inst->opcode != BRW_OPCODE_MOV ||
2086           inst->is_partial_write() ||
2087           inst->saturate ||
2088           inst->src[0].file != GRF ||
2089           inst->src[0].negate ||
2090           inst->src[0].abs ||
2091           inst->src[0].smear != -1 ||
2092           inst->dst.file != GRF ||
2093           inst->dst.type != inst->src[0].type ||
2094           virtual_grf_sizes[inst->src[0].reg] != 1) {
2095          continue;
2096       }
2097
2098       int var_from = live_intervals->var_from_reg(&inst->src[0]);
2099       int var_to = live_intervals->var_from_reg(&inst->dst);
2100
2101       if (live_intervals->vars_interfere(var_from, var_to))
2102          continue;
2103
2104       int reg_from = inst->src[0].reg;
2105       assert(inst->src[0].reg_offset == 0);
2106       int reg_to = inst->dst.reg;
2107       int reg_to_offset = inst->dst.reg_offset;
2108
2109       foreach_list(node, &this->instructions) {
2110          fs_inst *scan_inst = (fs_inst *)node;
2111
2112          if (scan_inst->dst.file == GRF &&
2113              scan_inst->dst.reg == reg_from) {
2114             scan_inst->dst.reg = reg_to;
2115             scan_inst->dst.reg_offset = reg_to_offset;
2116          }
2117          for (int i = 0; i < 3; i++) {
2118             if (scan_inst->src[i].file == GRF &&
2119                 scan_inst->src[i].reg == reg_from) {
2120                scan_inst->src[i].reg = reg_to;
2121                scan_inst->src[i].reg_offset = reg_to_offset;
2122             }
2123          }
2124       }
2125
2126       inst->remove();
2127       progress = true;
2128       continue;
2129    }
2130
2131    if (progress)
2132       invalidate_live_intervals();
2133
2134    return progress;
2135 }
2136
2137 bool
2138 fs_visitor::register_coalesce()
2139 {
2140    bool progress = false;
2141    int if_depth = 0;
2142    int loop_depth = 0;
2143
2144    foreach_list_safe(node, &this->instructions) {
2145       fs_inst *inst = (fs_inst *)node;
2146
2147       /* Make sure that we dominate the instructions we're going to
2148        * scan for interfering with our coalescing, or we won't have
2149        * scanned enough to see if anything interferes with our
2150        * coalescing.  We don't dominate the following instructions if
2151        * we're in a loop or an if block.
2152        */
2153       switch (inst->opcode) {
2154       case BRW_OPCODE_DO:
2155          loop_depth++;
2156          break;
2157       case BRW_OPCODE_WHILE:
2158          loop_depth--;
2159          break;
2160       case BRW_OPCODE_IF:
2161          if_depth++;
2162          break;
2163       case BRW_OPCODE_ENDIF:
2164          if_depth--;
2165          break;
2166       default:
2167          break;
2168       }
2169       if (loop_depth || if_depth)
2170          continue;
2171
2172       if (inst->opcode != BRW_OPCODE_MOV ||
2173           inst->is_partial_write() ||
2174           inst->saturate ||
2175           inst->dst.file != GRF || (inst->src[0].file != GRF &&
2176                                     inst->src[0].file != UNIFORM)||
2177           inst->dst.type != inst->src[0].type)
2178          continue;
2179
2180       bool has_source_modifiers = (inst->src[0].abs ||
2181                                    inst->src[0].negate ||
2182                                    inst->src[0].smear != -1 ||
2183                                    inst->src[0].file == UNIFORM);
2184
2185       /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
2186        * them: check for no writes to either one until the exit of the
2187        * program.
2188        */
2189       bool interfered = false;
2190
2191       for (fs_inst *scan_inst = (fs_inst *)inst->next;
2192            !scan_inst->is_tail_sentinel();
2193            scan_inst = (fs_inst *)scan_inst->next) {
2194          if (scan_inst->dst.file == GRF) {
2195             if (scan_inst->overwrites_reg(inst->dst) ||
2196                 scan_inst->overwrites_reg(inst->src[0])) {
2197                interfered = true;
2198                break;
2199             }
2200          }
2201
2202          if (has_source_modifiers) {
2203             for (int i = 0; i < 3; i++) {
2204                if (scan_inst->src[i].file == GRF &&
2205                    scan_inst->src[i].reg == inst->dst.reg &&
2206                    scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2207                    inst->dst.type != scan_inst->src[i].type)
2208                {
2209                  interfered = true;
2210                  break;
2211                }
2212             }
2213          }
2214
2215
2216          /* The gen6 MATH instruction can't handle source modifiers or
2217           * unusual register regions, so avoid coalescing those for
2218           * now.  We should do something more specific.
2219           */
2220          if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2221             interfered = true;
2222             break;
2223          }
2224
2225          if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2226              scan_inst->src[0].file == GRF &&
2227              scan_inst->src[0].reg == inst->dst.reg) {
2228             interfered = true;
2229             break;
2230          }
2231
2232          /* The accumulator result appears to get used for the
2233           * conditional modifier generation.  When negating a UD
2234           * value, there is a 33rd bit generated for the sign in the
2235           * accumulator value, so now you can't check, for example,
2236           * equality with a 32-bit value.  See piglit fs-op-neg-uint.
2237           */
2238          if (scan_inst->conditional_mod &&
2239              inst->src[0].negate &&
2240              inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2241             interfered = true;
2242             break;
2243          }
2244       }
2245       if (interfered) {
2246          continue;
2247       }
2248
2249       /* Rewrite the later usage to point at the source of the move to
2250        * be removed.
2251        */
2252       for (fs_inst *scan_inst = inst;
2253            !scan_inst->is_tail_sentinel();
2254            scan_inst = (fs_inst *)scan_inst->next) {
2255          for (int i = 0; i < 3; i++) {
2256             if (scan_inst->src[i].file == GRF &&
2257                 scan_inst->src[i].reg == inst->dst.reg &&
2258                 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2259                fs_reg new_src = inst->src[0];
2260                if (scan_inst->src[i].abs) {
2261                   new_src.negate = 0;
2262                   new_src.abs = 1;
2263                }
2264                new_src.negate ^= scan_inst->src[i].negate;
2265                new_src.sechalf = scan_inst->src[i].sechalf;
2266                scan_inst->src[i] = new_src;
2267             }
2268          }
2269       }
2270
2271       inst->remove();
2272       progress = true;
2273    }
2274
2275    if (progress)
2276       invalidate_live_intervals();
2277
2278    return progress;
2279 }
2280
2281
2282 bool
2283 fs_visitor::compute_to_mrf()
2284 {
2285    bool progress = false;
2286    int next_ip = 0;
2287
2288    calculate_live_intervals();
2289
2290    foreach_list_safe(node, &this->instructions) {
2291       fs_inst *inst = (fs_inst *)node;
2292
2293       int ip = next_ip;
2294       next_ip++;
2295
2296       if (inst->opcode != BRW_OPCODE_MOV ||
2297           inst->is_partial_write() ||
2298           inst->dst.file != MRF || inst->src[0].file != GRF ||
2299           inst->dst.type != inst->src[0].type ||
2300           inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2301          continue;
2302
2303       /* Work out which hardware MRF registers are written by this
2304        * instruction.
2305        */
2306       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2307       int mrf_high;
2308       if (inst->dst.reg & BRW_MRF_COMPR4) {
2309          mrf_high = mrf_low + 4;
2310       } else if (dispatch_width == 16 &&
2311                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2312          mrf_high = mrf_low + 1;
2313       } else {
2314          mrf_high = mrf_low;
2315       }
2316
2317       /* Can't compute-to-MRF this GRF if someone else was going to
2318        * read it later.
2319        */
2320       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2321          continue;
2322
2323       /* Found a move of a GRF to a MRF.  Let's see if we can go
2324        * rewrite the thing that made this GRF to write into the MRF.
2325        */
2326       fs_inst *scan_inst;
2327       for (scan_inst = (fs_inst *)inst->prev;
2328            scan_inst->prev != NULL;
2329            scan_inst = (fs_inst *)scan_inst->prev) {
2330          if (scan_inst->dst.file == GRF &&
2331              scan_inst->dst.reg == inst->src[0].reg) {
2332             /* Found the last thing to write our reg we want to turn
2333              * into a compute-to-MRF.
2334              */
2335
2336             /* If this one instruction didn't populate all the
2337              * channels, bail.  We might be able to rewrite everything
2338              * that writes that reg, but it would require smarter
2339              * tracking to delay the rewriting until complete success.
2340              */
2341             if (scan_inst->is_partial_write())
2342                break;
2343
2344             /* Things returning more than one register would need us to
2345              * understand coalescing out more than one MOV at a time.
2346              */
2347             if (scan_inst->regs_written > 1)
2348                break;
2349
2350             /* SEND instructions can't have MRF as a destination. */
2351             if (scan_inst->mlen)
2352                break;
2353
2354             if (brw->gen == 6) {
2355                /* gen6 math instructions must have the destination be
2356                 * GRF, so no compute-to-MRF for them.
2357                 */
2358                if (scan_inst->is_math()) {
2359                   break;
2360                }
2361             }
2362
2363             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2364                /* Found the creator of our MRF's source value. */
2365                scan_inst->dst.file = MRF;
2366                scan_inst->dst.reg = inst->dst.reg;
2367                scan_inst->saturate |= inst->saturate;
2368                inst->remove();
2369                progress = true;
2370             }
2371             break;
2372          }
2373
2374          /* We don't handle control flow here.  Most computation of
2375           * values that end up in MRFs are shortly before the MRF
2376           * write anyway.
2377           */
2378          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2379             break;
2380
2381          /* You can't read from an MRF, so if someone else reads our
2382           * MRF's source GRF that we wanted to rewrite, that stops us.
2383           */
2384          bool interfered = false;
2385          for (int i = 0; i < 3; i++) {
2386             if (scan_inst->src[i].file == GRF &&
2387                 scan_inst->src[i].reg == inst->src[0].reg &&
2388                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2389                interfered = true;
2390             }
2391          }
2392          if (interfered)
2393             break;
2394
2395          if (scan_inst->dst.file == MRF) {
2396             /* If somebody else writes our MRF here, we can't
2397              * compute-to-MRF before that.
2398              */
2399             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2400             int scan_mrf_high;
2401
2402             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2403                scan_mrf_high = scan_mrf_low + 4;
2404             } else if (dispatch_width == 16 &&
2405                        (!scan_inst->force_uncompressed &&
2406                         !scan_inst->force_sechalf)) {
2407                scan_mrf_high = scan_mrf_low + 1;
2408             } else {
2409                scan_mrf_high = scan_mrf_low;
2410             }
2411
2412             if (mrf_low == scan_mrf_low ||
2413                 mrf_low == scan_mrf_high ||
2414                 mrf_high == scan_mrf_low ||
2415                 mrf_high == scan_mrf_high) {
2416                break;
2417             }
2418          }
2419
2420          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2421             /* Found a SEND instruction, which means that there are
2422              * live values in MRFs from base_mrf to base_mrf +
2423              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2424              * above it.
2425              */
2426             if (mrf_low >= scan_inst->base_mrf &&
2427                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2428                break;
2429             }
2430             if (mrf_high >= scan_inst->base_mrf &&
2431                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2432                break;
2433             }
2434          }
2435       }
2436    }
2437
2438    if (progress)
2439       invalidate_live_intervals();
2440
2441    return progress;
2442 }
2443
2444 /**
2445  * Walks through basic blocks, looking for repeated MRF writes and
2446  * removing the later ones.
2447  */
2448 bool
2449 fs_visitor::remove_duplicate_mrf_writes()
2450 {
2451    fs_inst *last_mrf_move[16];
2452    bool progress = false;
2453
2454    /* Need to update the MRF tracking for compressed instructions. */
2455    if (dispatch_width == 16)
2456       return false;
2457
2458    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2459
2460    foreach_list_safe(node, &this->instructions) {
2461       fs_inst *inst = (fs_inst *)node;
2462
2463       if (inst->is_control_flow()) {
2464          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2465       }
2466
2467       if (inst->opcode == BRW_OPCODE_MOV &&
2468           inst->dst.file == MRF) {
2469          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2470          if (prev_inst && inst->equals(prev_inst)) {
2471             inst->remove();
2472             progress = true;
2473             continue;
2474          }
2475       }
2476
2477       /* Clear out the last-write records for MRFs that were overwritten. */
2478       if (inst->dst.file == MRF) {
2479          last_mrf_move[inst->dst.reg] = NULL;
2480       }
2481
2482       if (inst->mlen > 0 && inst->base_mrf != -1) {
2483          /* Found a SEND instruction, which will include two or fewer
2484           * implied MRF writes.  We could do better here.
2485           */
2486          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2487             last_mrf_move[inst->base_mrf + i] = NULL;
2488          }
2489       }
2490
2491       /* Clear out any MRF move records whose sources got overwritten. */
2492       if (inst->dst.file == GRF) {
2493          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2494             if (last_mrf_move[i] &&
2495                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2496                last_mrf_move[i] = NULL;
2497             }
2498          }
2499       }
2500
2501       if (inst->opcode == BRW_OPCODE_MOV &&
2502           inst->dst.file == MRF &&
2503           inst->src[0].file == GRF &&
2504           !inst->is_partial_write()) {
2505          last_mrf_move[inst->dst.reg] = inst;
2506       }
2507    }
2508
2509    if (progress)
2510       invalidate_live_intervals();
2511
2512    return progress;
2513 }
2514
2515 static void
2516 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2517                         int first_grf, int grf_len)
2518 {
2519    bool inst_16wide = (dispatch_width > 8 &&
2520                        !inst->force_uncompressed &&
2521                        !inst->force_sechalf);
2522
2523    /* Clear the flag for registers that actually got read (as expected). */
2524    for (int i = 0; i < 3; i++) {
2525       int grf;
2526       if (inst->src[i].file == GRF) {
2527          grf = inst->src[i].reg;
2528       } else if (inst->src[i].file == HW_REG &&
2529                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2530          grf = inst->src[i].fixed_hw_reg.nr;
2531       } else {
2532          continue;
2533       }
2534
2535       if (grf >= first_grf &&
2536           grf < first_grf + grf_len) {
2537          deps[grf - first_grf] = false;
2538          if (inst_16wide)
2539             deps[grf - first_grf + 1] = false;
2540       }
2541    }
2542 }
2543
2544 /**
2545  * Implements this workaround for the original 965:
2546  *
2547  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2548  *      check for post destination dependencies on this instruction, software
2549  *      must ensure that there is no destination hazard for the case of ‘write
2550  *      followed by a posted write’ shown in the following example.
2551  *
2552  *      1. mov r3 0
2553  *      2. send r3.xy <rest of send instruction>
2554  *      3. mov r2 r3
2555  *
2556  *      Due to no post-destination dependency check on the ‘send’, the above
2557  *      code sequence could have two instructions (1 and 2) in flight at the
2558  *      same time that both consider ‘r3’ as the target of their final writes.
2559  */
2560 void
2561 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2562 {
2563    int reg_size = dispatch_width / 8;
2564    int write_len = inst->regs_written * reg_size;
2565    int first_write_grf = inst->dst.reg;
2566    bool needs_dep[BRW_MAX_MRF];
2567    assert(write_len < (int)sizeof(needs_dep) - 1);
2568
2569    memset(needs_dep, false, sizeof(needs_dep));
2570    memset(needs_dep, true, write_len);
2571
2572    clear_deps_for_inst_src(inst, dispatch_width,
2573                            needs_dep, first_write_grf, write_len);
2574
2575    /* Walk backwards looking for writes to registers we're writing which
2576     * aren't read since being written.  If we hit the start of the program,
2577     * we assume that there are no outstanding dependencies on entry to the
2578     * program.
2579     */
2580    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2581         scan_inst != NULL;
2582         scan_inst = (fs_inst *)scan_inst->prev) {
2583
2584       /* If we hit control flow, assume that there *are* outstanding
2585        * dependencies, and force their cleanup before our instruction.
2586        */
2587       if (scan_inst->is_control_flow()) {
2588          for (int i = 0; i < write_len; i++) {
2589             if (needs_dep[i]) {
2590                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2591             }
2592          }
2593          return;
2594       }
2595
2596       bool scan_inst_16wide = (dispatch_width > 8 &&
2597                                !scan_inst->force_uncompressed &&
2598                                !scan_inst->force_sechalf);
2599
2600       /* We insert our reads as late as possible on the assumption that any
2601        * instruction but a MOV that might have left us an outstanding
2602        * dependency has more latency than a MOV.
2603        */
2604       if (scan_inst->dst.file == GRF) {
2605          for (int i = 0; i < scan_inst->regs_written; i++) {
2606             int reg = scan_inst->dst.reg + i * reg_size;
2607
2608             if (reg >= first_write_grf &&
2609                 reg < first_write_grf + write_len &&
2610                 needs_dep[reg - first_write_grf]) {
2611                inst->insert_before(DEP_RESOLVE_MOV(reg));
2612                needs_dep[reg - first_write_grf] = false;
2613                if (scan_inst_16wide)
2614                   needs_dep[reg - first_write_grf + 1] = false;
2615             }
2616          }
2617       }
2618
2619       /* Clear the flag for registers that actually got read (as expected). */
2620       clear_deps_for_inst_src(scan_inst, dispatch_width,
2621                               needs_dep, first_write_grf, write_len);
2622
2623       /* Continue the loop only if we haven't resolved all the dependencies */
2624       int i;
2625       for (i = 0; i < write_len; i++) {
2626          if (needs_dep[i])
2627             break;
2628       }
2629       if (i == write_len)
2630          return;
2631    }
2632 }
2633
2634 /**
2635  * Implements this workaround for the original 965:
2636  *
2637  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2638  *      used as a destination register until after it has been sourced by an
2639  *      instruction with a different destination register.
2640  */
2641 void
2642 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2643 {
2644    int write_len = inst->regs_written * dispatch_width / 8;
2645    int first_write_grf = inst->dst.reg;
2646    bool needs_dep[BRW_MAX_MRF];
2647    assert(write_len < (int)sizeof(needs_dep) - 1);
2648
2649    memset(needs_dep, false, sizeof(needs_dep));
2650    memset(needs_dep, true, write_len);
2651    /* Walk forwards looking for writes to registers we're writing which aren't
2652     * read before being written.
2653     */
2654    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2655         !scan_inst->is_tail_sentinel();
2656         scan_inst = (fs_inst *)scan_inst->next) {
2657       /* If we hit control flow, force resolve all remaining dependencies. */
2658       if (scan_inst->is_control_flow()) {
2659          for (int i = 0; i < write_len; i++) {
2660             if (needs_dep[i])
2661                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2662          }
2663          return;
2664       }
2665
2666       /* Clear the flag for registers that actually got read (as expected). */
2667       clear_deps_for_inst_src(scan_inst, dispatch_width,
2668                               needs_dep, first_write_grf, write_len);
2669
2670       /* We insert our reads as late as possible since they're reading the
2671        * result of a SEND, which has massive latency.
2672        */
2673       if (scan_inst->dst.file == GRF &&
2674           scan_inst->dst.reg >= first_write_grf &&
2675           scan_inst->dst.reg < first_write_grf + write_len &&
2676           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2677          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2678          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2679       }
2680
2681       /* Continue the loop only if we haven't resolved all the dependencies */
2682       int i;
2683       for (i = 0; i < write_len; i++) {
2684          if (needs_dep[i])
2685             break;
2686       }
2687       if (i == write_len)
2688          return;
2689    }
2690
2691    /* If we hit the end of the program, resolve all remaining dependencies out
2692     * of paranoia.
2693     */
2694    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2695    assert(last_inst->eot);
2696    for (int i = 0; i < write_len; i++) {
2697       if (needs_dep[i])
2698          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2699    }
2700 }
2701
2702 void
2703 fs_visitor::insert_gen4_send_dependency_workarounds()
2704 {
2705    if (brw->gen != 4 || brw->is_g4x)
2706       return;
2707
2708    /* Note that we're done with register allocation, so GRF fs_regs always
2709     * have a .reg_offset of 0.
2710     */
2711
2712    foreach_list_safe(node, &this->instructions) {
2713       fs_inst *inst = (fs_inst *)node;
2714
2715       if (inst->mlen != 0 && inst->dst.file == GRF) {
2716          insert_gen4_pre_send_dependency_workarounds(inst);
2717          insert_gen4_post_send_dependency_workarounds(inst);
2718       }
2719    }
2720 }
2721
2722 /**
2723  * Turns the generic expression-style uniform pull constant load instruction
2724  * into a hardware-specific series of instructions for loading a pull
2725  * constant.
2726  *
2727  * The expression style allows the CSE pass before this to optimize out
2728  * repeated loads from the same offset, and gives the pre-register-allocation
2729  * scheduling full flexibility, while the conversion to native instructions
2730  * allows the post-register-allocation scheduler the best information
2731  * possible.
2732  *
2733  * Note that execution masking for setting up pull constant loads is special:
2734  * the channels that need to be written are unrelated to the current execution
2735  * mask, since a later instruction will use one of the result channels as a
2736  * source operand for all 8 or 16 of its channels.
2737  */
2738 void
2739 fs_visitor::lower_uniform_pull_constant_loads()
2740 {
2741    foreach_list(node, &this->instructions) {
2742       fs_inst *inst = (fs_inst *)node;
2743
2744       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2745          continue;
2746
2747       if (brw->gen >= 7) {
2748          /* The offset arg before was a vec4-aligned byte offset.  We need to
2749           * turn it into a dword offset.
2750           */
2751          fs_reg const_offset_reg = inst->src[1];
2752          assert(const_offset_reg.file == IMM &&
2753                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2754          const_offset_reg.imm.u /= 4;
2755          fs_reg payload = fs_reg(this, glsl_type::uint_type);
2756
2757          /* This is actually going to be a MOV, but since only the first dword
2758           * is accessed, we have a special opcode to do just that one.  Note
2759           * that this needs to be an operation that will be considered a def
2760           * by live variable analysis, or register allocation will explode.
2761           */
2762          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2763                                                payload, const_offset_reg);
2764          setup->force_writemask_all = true;
2765
2766          setup->ir = inst->ir;
2767          setup->annotation = inst->annotation;
2768          inst->insert_before(setup);
2769
2770          /* Similarly, this will only populate the first 4 channels of the
2771           * result register (since we only use smear values from 0-3), but we
2772           * don't tell the optimizer.
2773           */
2774          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2775          inst->src[1] = payload;
2776
2777          invalidate_live_intervals();
2778       } else {
2779          /* Before register allocation, we didn't tell the scheduler about the
2780           * MRF we use.  We know it's safe to use this MRF because nothing
2781           * else does except for register spill/unspill, which generates and
2782           * uses its MRF within a single IR instruction.
2783           */
2784          inst->base_mrf = 14;
2785          inst->mlen = 1;
2786       }
2787    }
2788 }
2789
2790 void
2791 fs_visitor::dump_instruction(backend_instruction *be_inst)
2792 {
2793    fs_inst *inst = (fs_inst *)be_inst;
2794
2795    if (inst->predicate) {
2796       printf("(%cf0.%d) ",
2797              inst->predicate_inverse ? '-' : '+',
2798              inst->flag_subreg);
2799    }
2800
2801    printf("%s", brw_instruction_name(inst->opcode));
2802    if (inst->saturate)
2803       printf(".sat");
2804    if (inst->conditional_mod) {
2805       printf(".cmod");
2806       if (!inst->predicate &&
2807           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2808                               inst->opcode != BRW_OPCODE_IF &&
2809                               inst->opcode != BRW_OPCODE_WHILE))) {
2810          printf(".f0.%d", inst->flag_subreg);
2811       }
2812    }
2813    printf(" ");
2814
2815
2816    switch (inst->dst.file) {
2817    case GRF:
2818       printf("vgrf%d", inst->dst.reg);
2819       if (inst->dst.reg_offset)
2820          printf("+%d", inst->dst.reg_offset);
2821       break;
2822    case MRF:
2823       printf("m%d", inst->dst.reg);
2824       break;
2825    case BAD_FILE:
2826       printf("(null)");
2827       break;
2828    case UNIFORM:
2829       printf("***u%d***", inst->dst.reg);
2830       break;
2831    case HW_REG:
2832       printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2833       if (inst->dst.fixed_hw_reg.subnr)
2834          printf("+%d", inst->dst.fixed_hw_reg.subnr);
2835       break;
2836    default:
2837       printf("???");
2838       break;
2839    }
2840    printf(", ");
2841
2842    for (int i = 0; i < 3; i++) {
2843       if (inst->src[i].negate)
2844          printf("-");
2845       if (inst->src[i].abs)
2846          printf("|");
2847       switch (inst->src[i].file) {
2848       case GRF:
2849          printf("vgrf%d", inst->src[i].reg);
2850          if (inst->src[i].reg_offset)
2851             printf("+%d", inst->src[i].reg_offset);
2852          break;
2853       case MRF:
2854          printf("***m%d***", inst->src[i].reg);
2855          break;
2856       case UNIFORM:
2857          printf("u%d", inst->src[i].reg);
2858          if (inst->src[i].reg_offset)
2859             printf(".%d", inst->src[i].reg_offset);
2860          break;
2861       case BAD_FILE:
2862          printf("(null)");
2863          break;
2864       case IMM:
2865          switch (inst->src[i].type) {
2866          case BRW_REGISTER_TYPE_F:
2867             printf("%ff", inst->src[i].imm.f);
2868             break;
2869          case BRW_REGISTER_TYPE_D:
2870             printf("%dd", inst->src[i].imm.i);
2871             break;
2872          case BRW_REGISTER_TYPE_UD:
2873             printf("%uu", inst->src[i].imm.u);
2874             break;
2875          default:
2876             printf("???");
2877             break;
2878          }
2879          break;
2880       case HW_REG:
2881          if (inst->src[i].fixed_hw_reg.negate)
2882             printf("-");
2883          if (inst->src[i].fixed_hw_reg.abs)
2884             printf("|");
2885          printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2886          if (inst->src[i].fixed_hw_reg.subnr)
2887             printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2888          if (inst->src[i].fixed_hw_reg.abs)
2889             printf("|");
2890          break;
2891       default:
2892          printf("???");
2893          break;
2894       }
2895       if (inst->src[i].abs)
2896          printf("|");
2897
2898       if (i < 3)
2899          printf(", ");
2900    }
2901
2902    printf(" ");
2903
2904    if (inst->force_uncompressed)
2905       printf("1sthalf ");
2906
2907    if (inst->force_sechalf)
2908       printf("2ndhalf ");
2909
2910    printf("\n");
2911 }
2912
2913 /**
2914  * Possibly returns an instruction that set up @param reg.
2915  *
2916  * Sometimes we want to take the result of some expression/variable
2917  * dereference tree and rewrite the instruction generating the result
2918  * of the tree.  When processing the tree, we know that the
2919  * instructions generated are all writing temporaries that are dead
2920  * outside of this tree.  So, if we have some instructions that write
2921  * a temporary, we're free to point that temp write somewhere else.
2922  *
2923  * Note that this doesn't guarantee that the instruction generated
2924  * only reg -- it might be the size=4 destination of a texture instruction.
2925  */
2926 fs_inst *
2927 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2928                                            fs_inst *end,
2929                                            fs_reg reg)
2930 {
2931    if (end == start ||
2932        end->is_partial_write() ||
2933        reg.reladdr ||
2934        !reg.equals(end->dst)) {
2935       return NULL;
2936    } else {
2937       return end;
2938    }
2939 }
2940
2941 void
2942 fs_visitor::setup_payload_gen6()
2943 {
2944    bool uses_depth =
2945       (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2946    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2947
2948    assert(brw->gen >= 6);
2949
2950    /* R0-1: masks, pixel X/Y coordinates. */
2951    c->nr_payload_regs = 2;
2952    /* R2: only for 32-pixel dispatch.*/
2953
2954    /* R3-26: barycentric interpolation coordinates.  These appear in the
2955     * same order that they appear in the brw_wm_barycentric_interp_mode
2956     * enum.  Each set of coordinates occupies 2 registers if dispatch width
2957     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2958     * appear if they were enabled using the "Barycentric Interpolation
2959     * Mode" bits in WM_STATE.
2960     */
2961    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2962       if (barycentric_interp_modes & (1 << i)) {
2963          c->barycentric_coord_reg[i] = c->nr_payload_regs;
2964          c->nr_payload_regs += 2;
2965          if (dispatch_width == 16) {
2966             c->nr_payload_regs += 2;
2967          }
2968       }
2969    }
2970
2971    /* R27: interpolated depth if uses source depth */
2972    if (uses_depth) {
2973       c->source_depth_reg = c->nr_payload_regs;
2974       c->nr_payload_regs++;
2975       if (dispatch_width == 16) {
2976          /* R28: interpolated depth if not 8-wide. */
2977          c->nr_payload_regs++;
2978       }
2979    }
2980    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2981    if (uses_depth) {
2982       c->source_w_reg = c->nr_payload_regs;
2983       c->nr_payload_regs++;
2984       if (dispatch_width == 16) {
2985          /* R30: interpolated W if not 8-wide. */
2986          c->nr_payload_regs++;
2987       }
2988    }
2989    /* R31: MSAA position offsets. */
2990    /* R32-: bary for 32-pixel. */
2991    /* R58-59: interp W for 32-pixel. */
2992
2993    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2994       c->source_depth_to_render_target = true;
2995    }
2996 }
2997
2998 void
2999 fs_visitor::assign_binding_table_offsets()
3000 {
3001    uint32_t next_binding_table_offset = 0;
3002
3003    c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3004    next_binding_table_offset += c->key.nr_color_regions;
3005
3006    assign_common_binding_table_offsets(next_binding_table_offset);
3007 }
3008
3009 bool
3010 fs_visitor::run()
3011 {
3012    sanity_param_count = fp->Base.Parameters->NumParameters;
3013    uint32_t orig_nr_params = c->prog_data.nr_params;
3014
3015    assign_binding_table_offsets();
3016
3017    if (brw->gen >= 6)
3018       setup_payload_gen6();
3019    else
3020       setup_payload_gen4();
3021
3022    if (0) {
3023       emit_dummy_fs();
3024    } else {
3025       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3026          emit_shader_time_begin();
3027
3028       calculate_urb_setup();
3029       if (fp->Base.InputsRead > 0) {
3030          if (brw->gen < 6)
3031             emit_interpolation_setup_gen4();
3032          else
3033             emit_interpolation_setup_gen6();
3034       }
3035
3036       /* We handle discards by keeping track of the still-live pixels in f0.1.
3037        * Initialize it with the dispatched pixels.
3038        */
3039       if (fp->UsesKill) {
3040          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3041          discard_init->flag_subreg = 1;
3042       }
3043
3044       /* Generate FS IR for main().  (the visitor only descends into
3045        * functions called "main").
3046        */
3047       if (shader) {
3048          foreach_list(node, &*shader->ir) {
3049             ir_instruction *ir = (ir_instruction *)node;
3050             base_ir = ir;
3051             this->result = reg_undef;
3052             ir->accept(this);
3053          }
3054       } else {
3055          emit_fragment_program_code();
3056       }
3057       base_ir = NULL;
3058       if (failed)
3059          return false;
3060
3061       emit(FS_OPCODE_PLACEHOLDER_HALT);
3062
3063       emit_fb_writes();
3064
3065       split_virtual_grfs();
3066
3067       move_uniform_array_access_to_pull_constants();
3068       setup_pull_constants();
3069
3070       bool progress;
3071       do {
3072          progress = false;
3073
3074          compact_virtual_grfs();
3075
3076          progress = remove_duplicate_mrf_writes() || progress;
3077
3078          progress = opt_algebraic() || progress;
3079          progress = opt_cse() || progress;
3080          progress = opt_copy_propagate() || progress;
3081          progress = dead_code_eliminate() || progress;
3082          progress = dead_code_eliminate_local() || progress;
3083          progress = register_coalesce() || progress;
3084          progress = register_coalesce_2() || progress;
3085          progress = compute_to_mrf() || progress;
3086       } while (progress);
3087
3088       remove_dead_constants();
3089
3090       schedule_instructions(false);
3091
3092       lower_uniform_pull_constant_loads();
3093
3094       assign_curb_setup();
3095       assign_urb_setup();
3096
3097       if (0) {
3098          /* Debug of register spilling: Go spill everything. */
3099          for (int i = 0; i < virtual_grf_count; i++) {
3100             spill_reg(i);
3101          }
3102       }
3103
3104       if (0)
3105          assign_regs_trivial();
3106       else {
3107          while (!assign_regs()) {
3108             if (failed)
3109                break;
3110          }
3111       }
3112    }
3113    assert(force_uncompressed_stack == 0);
3114    assert(force_sechalf_stack == 0);
3115
3116    /* This must come after all optimization and register allocation, since
3117     * it inserts dead code that happens to have side effects, and it does
3118     * so based on the actual physical registers in use.
3119     */
3120    insert_gen4_send_dependency_workarounds();
3121
3122    if (failed)
3123       return false;
3124
3125    schedule_instructions(true);
3126
3127    if (dispatch_width == 8) {
3128       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3129    } else {
3130       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3131
3132       /* Make sure we didn't try to sneak in an extra uniform */
3133       assert(orig_nr_params == c->prog_data.nr_params);
3134       (void) orig_nr_params;
3135    }
3136
3137    /* If any state parameters were appended, then ParameterValues could have
3138     * been realloced, in which case the driver uniform storage set up by
3139     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3140     * sure that didn't happen.
3141     */
3142    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3143
3144    return !failed;
3145 }
3146
3147 const unsigned *
3148 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3149                struct gl_fragment_program *fp,
3150                struct gl_shader_program *prog,
3151                unsigned *final_assembly_size)
3152 {
3153    bool start_busy = false;
3154    float start_time = 0;
3155
3156    if (unlikely(brw->perf_debug)) {
3157       start_busy = (brw->batch.last_bo &&
3158                     drm_intel_bo_busy(brw->batch.last_bo));
3159       start_time = get_time();
3160    }
3161
3162    struct brw_shader *shader = NULL;
3163    if (prog)
3164       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3165
3166    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3167       if (prog) {
3168          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3169          _mesa_print_ir(shader->ir, NULL);
3170          printf("\n\n");
3171       } else {
3172          printf("ARB_fragment_program %d ir for native fragment shader\n",
3173                 fp->Base.Id);
3174          _mesa_print_program(&fp->Base);
3175       }
3176    }
3177
3178    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3179     */
3180    fs_visitor v(brw, c, prog, fp, 8);
3181    if (!v.run()) {
3182       if (prog) {
3183          prog->LinkStatus = false;
3184          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3185       }
3186
3187       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3188                     v.fail_msg);
3189
3190       return NULL;
3191    }
3192
3193    exec_list *simd16_instructions = NULL;
3194    fs_visitor v2(brw, c, prog, fp, 16);
3195    if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3196       if (c->prog_data.nr_pull_params == 0) {
3197          /* Try a 16-wide compile */
3198          v2.import_uniforms(&v);
3199          if (!v2.run()) {
3200             perf_debug("16-wide shader failed to compile, falling back to "
3201                        "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3202          } else {
3203             simd16_instructions = &v2.instructions;
3204          }
3205       } else {
3206          perf_debug("Skipping 16-wide due to pull parameters.\n");
3207       }
3208    }
3209
3210    fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3211    const unsigned *generated = g.generate_assembly(&v.instructions,
3212                                                    simd16_instructions,
3213                                                    final_assembly_size);
3214
3215    if (unlikely(brw->perf_debug) && shader) {
3216       if (shader->compiled_once)
3217          brw_wm_debug_recompile(brw, prog, &c->key);
3218       shader->compiled_once = true;
3219
3220       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3221          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3222                     (get_time() - start_time) * 1000);
3223       }
3224    }
3225
3226    return generated;
3227 }
3228
3229 bool
3230 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3231 {
3232    struct brw_context *brw = brw_context(ctx);
3233    struct brw_wm_prog_key key;
3234
3235    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3236       return true;
3237
3238    struct gl_fragment_program *fp = (struct gl_fragment_program *)
3239       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3240    struct brw_fragment_program *bfp = brw_fragment_program(fp);
3241    bool program_uses_dfdy = fp->UsesDFdy;
3242
3243    memset(&key, 0, sizeof(key));
3244
3245    if (brw->gen < 6) {
3246       if (fp->UsesKill)
3247          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3248
3249       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3250          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3251
3252       /* Just assume depth testing. */
3253       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3254       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3255    }
3256
3257    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3258                                          BRW_FS_VARYING_INPUT_MASK) > 16)
3259       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3260
3261    key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3262
3263    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3264    for (unsigned i = 0; i < sampler_count; i++) {
3265       if (fp->Base.ShadowSamplers & (1 << i)) {
3266          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3267          key.tex.swizzles[i] =
3268             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3269       } else {
3270          /* Color sampler: assume no swizzling. */
3271          key.tex.swizzles[i] = SWIZZLE_XYZW;
3272       }
3273    }
3274
3275    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3276       key.drawable_height = ctx->DrawBuffer->Height;
3277    }
3278
3279    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3280       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3281    }
3282
3283    key.nr_color_regions = 1;
3284
3285    /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE.  The
3286     * quality of the derivatives is likely to be determined by the driconf
3287     * option.
3288     */
3289    key.high_quality_derivatives = brw->disable_derivative_optimization;
3290
3291    key.program_string_id = bfp->id;
3292
3293    uint32_t old_prog_offset = brw->wm.base.prog_offset;
3294    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3295
3296    bool success = do_wm_prog(brw, prog, bfp, &key);
3297
3298    brw->wm.base.prog_offset = old_prog_offset;
3299    brw->wm.prog_data = old_prog_data;
3300
3301    return success;
3302 }