1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
66 }
67
68 fs_inst::fs_inst()
69 {
70 init();
71 }
72
73 fs_inst::fs_inst(enum opcode opcode)
74 {
75 init();
76 this->opcode = opcode;
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
80 {
81 init();
82 this->opcode = opcode;
83 this->dst = dst;
84
85 if (dst.file == GRF)
86 assert(dst.reg_offset >= 0);
87 }
88
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
90 {
91 init();
92 this->opcode = opcode;
93 this->dst = dst;
94 this->src[0] = src0;
95
96 if (dst.file == GRF)
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
103 {
104 init();
105 this->opcode = opcode;
106 this->dst = dst;
107 this->src[0] = src0;
108 this->src[1] = src1;
109
110 if (dst.file == GRF)
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
116 }
117
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
120 {
121 init();
122 this->opcode = opcode;
123 this->dst = dst;
124 this->src[0] = src0;
125 this->src[1] = src1;
126 this->src[2] = src2;
127
128 if (dst.file == GRF)
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
136 }
137
138 #define ALU1(op) \
139 fs_inst * \
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
141 { \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
143 }
144
145 #define ALU2(op) \
146 fs_inst * \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
148 { \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
150 }
151
152 #define ALU3(op) \
153 fs_inst * \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
155 { \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
157 }
158
159 ALU1(NOT)
160 ALU1(MOV)
161 ALU1(FRC)
162 ALU1(RNDD)
163 ALU1(RNDE)
164 ALU1(RNDZ)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
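/* As an illustration (not part of the original file): ALU2(ADD) above
 * expands to a builder that allocates the instruction out of mem_ctx:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * These helpers only construct the fs_inst; the caller still passes the
 * result to emit() to append it to the instruction stream.
 */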
182
183 /** Gen4 predicated IF. */
184 fs_inst *
185 fs_visitor::IF(uint32_t predicate)
186 {
187 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
188 inst->predicate = predicate;
189 return inst;
190 }
191
192 /** Gen6+ IF with embedded comparison. */
193 fs_inst *
194 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
195 {
196 assert(brw->gen >= 6);
197 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
198 reg_null_d, src0, src1);
199 inst->conditional_mod = condition;
200 return inst;
201 }
202
203 /**
204 * CMP: Sets the low bit of the destination channels with the result
205 * of the comparison, while the upper bits are undefined, and updates
206 * the flag register with the packed 16 bits of the result.
207 */
208 fs_inst *
209 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
210 {
211 fs_inst *inst;
212
213 /* Take the instruction:
214 *
215 * CMP null<d> src0<f> src1<f>
216 *
217 * Original gen4 does type conversion to the destination type before
218 * comparison, producing garbage results for floating point comparisons.
219 * gen5 does the comparison on the execution type (resolved source types),
220 * so dst type doesn't matter. gen6 does comparison and then uses the
221 * result as if it was the dst type with no conversion, which happens to
222 * mostly work out for float-interpreted-as-int since our comparisons are
223 * for >0, =0, <0.
224 */
225 if (brw->gen == 4) {
226 dst.type = src0.type;
227 if (dst.file == HW_REG)
228 dst.fixed_hw_reg.type = dst.type;
229 }
230
231 resolve_ud_negate(&src0);
232 resolve_ud_negate(&src1);
233
234 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
235 inst->conditional_mod = condition;
236
237 return inst;
238 }
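/* Typical usage, as a minimal sketch (x, a, b and dst are hypothetical
 * registers): emit a comparison purely for its flag side effect, then
 * predicate a following instruction on the result:
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(BRW_OPCODE_SEL, dst, a, b);
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 *
 * Writing to reg_null_d discards the per-channel low-bit results and
 * keeps only the packed flag-register update.
 */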
239
240 exec_list
241 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
242 fs_reg varying_offset,
243 uint32_t const_offset)
244 {
245 exec_list instructions;
246 fs_inst *inst;
247
248 /* We have our constant surface use a pitch of 4 bytes, so our index can
249 * be any component of a vector, and then we load 4 contiguous
250 * components starting from that.
251 *
252 * We break down the const_offset to a portion added to the variable
253 * offset and a portion done using reg_offset, which means that if you
254 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
255 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
256 * CSE can later notice that those loads are all the same and eliminate
257 * the redundant ones.
258 */
259 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
260 instructions.push_tail(ADD(vec4_offset,
261 varying_offset, const_offset & ~3));
262
263 int scale = 1;
264 if (brw->gen == 4 && dispatch_width == 8) {
265 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
266 * u, v, r) as parameters, or we can just use the SIMD16 message
267 * consisting of (header, u). We choose the second, at the cost of a
268 * longer return length.
269 */
270 scale = 2;
271 }
272
273 enum opcode op;
274 if (brw->gen >= 7)
275 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
276 else
277 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
278 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
279 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
280 inst->regs_written = 4 * scale;
281 instructions.push_tail(inst);
282
283 if (brw->gen < 7) {
284 inst->base_mrf = 13;
285 inst->header_present = true;
286 if (brw->gen == 4)
287 inst->mlen = 3;
288 else
289 inst->mlen = 1 + dispatch_width / 8;
290 }
291
292 vec4_result.reg_offset += (const_offset & 3) * scale;
293 instructions.push_tail(MOV(dst, vec4_result));
294
295 return instructions;
296 }
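/* Worked example (illustrative): with const_offset == 7 and scale == 1,
 * the vec4-aligned part (7 & ~3 == 4) is added to varying_offset, the
 * send fills vec4_result with four contiguous components, and the
 * remainder (7 & 3 == 3) bumps reg_offset to select the fourth component
 * for the final MOV into dst.
 */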
297
298 /**
299 * A helper that generates a MOV to work around broken hardware SEND
300 * dependency handling.
301 */
302 fs_inst *
303 fs_visitor::DEP_RESOLVE_MOV(int grf)
304 {
305 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
306
307 inst->ir = NULL;
308 inst->annotation = "send dependency resolve";
309
310 /* The caller always wants uncompressed to emit the minimal extra
311 * dependencies, and to avoid having to deal with aligning its regs to 2.
312 */
313 inst->force_uncompressed = true;
314
315 return inst;
316 }
317
318 bool
319 fs_inst::equals(fs_inst *inst)
320 {
321 return (opcode == inst->opcode &&
322 dst.equals(inst->dst) &&
323 src[0].equals(inst->src[0]) &&
324 src[1].equals(inst->src[1]) &&
325 src[2].equals(inst->src[2]) &&
326 saturate == inst->saturate &&
327 predicate == inst->predicate &&
328 conditional_mod == inst->conditional_mod &&
329 mlen == inst->mlen &&
330 base_mrf == inst->base_mrf &&
331 sampler == inst->sampler &&
332 target == inst->target &&
333 eot == inst->eot &&
334 header_present == inst->header_present &&
335 shadow_compare == inst->shadow_compare &&
336 offset == inst->offset);
337 }
338
339 bool
340 fs_inst::overwrites_reg(const fs_reg &reg)
341 {
342 return (reg.file == dst.file &&
343 reg.reg == dst.reg &&
344 reg.reg_offset >= dst.reg_offset &&
345 reg.reg_offset < dst.reg_offset + regs_written);
346 }
347
348 bool
349 fs_inst::is_send_from_grf()
350 {
351 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
352 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
353 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
354 src[1].file == GRF));
355 }
356
357 bool
358 fs_visitor::can_do_source_mods(fs_inst *inst)
359 {
360 if (brw->gen == 6 && inst->is_math())
361 return false;
362
363 if (inst->is_send_from_grf())
364 return false;
365
366 return true;
367 }
368
369 void
370 fs_reg::init()
371 {
372 memset(this, 0, sizeof(*this));
373 this->smear = -1;
374 }
375
376 /** Generic unset register constructor. */
377 fs_reg::fs_reg()
378 {
379 init();
380 this->file = BAD_FILE;
381 }
382
383 /** Immediate value constructor. */
384 fs_reg::fs_reg(float f)
385 {
386 init();
387 this->file = IMM;
388 this->type = BRW_REGISTER_TYPE_F;
389 this->imm.f = f;
390 }
391
392 /** Immediate value constructor. */
393 fs_reg::fs_reg(int32_t i)
394 {
395 init();
396 this->file = IMM;
397 this->type = BRW_REGISTER_TYPE_D;
398 this->imm.i = i;
399 }
400
401 /** Immediate value constructor. */
402 fs_reg::fs_reg(uint32_t u)
403 {
404 init();
405 this->file = IMM;
406 this->type = BRW_REGISTER_TYPE_UD;
407 this->imm.u = u;
408 }
409
410 /** Fixed brw_reg Immediate value constructor. */
411 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
412 {
413 init();
414 this->file = HW_REG;
415 this->fixed_hw_reg = fixed_hw_reg;
416 this->type = fixed_hw_reg.type;
417 }
418
419 bool
420 fs_reg::equals(const fs_reg &r) const
421 {
422 return (file == r.file &&
423 reg == r.reg &&
424 reg_offset == r.reg_offset &&
425 type == r.type &&
426 negate == r.negate &&
427 abs == r.abs &&
428 !reladdr && !r.reladdr &&
429 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
430 sizeof(fixed_hw_reg)) == 0 &&
431 smear == r.smear &&
432 imm.u == r.imm.u);
433 }
434
435 bool
436 fs_reg::is_zero() const
437 {
438 if (file != IMM)
439 return false;
440
441 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
442 }
443
444 bool
445 fs_reg::is_one() const
446 {
447 if (file != IMM)
448 return false;
449
450 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
451 }
452
453 bool
454 fs_reg::is_valid_3src() const
455 {
456 return file == GRF || file == UNIFORM;
457 }
458
459 int
460 fs_visitor::type_size(const struct glsl_type *type)
461 {
462 unsigned int size, i;
463
464 switch (type->base_type) {
465 case GLSL_TYPE_UINT:
466 case GLSL_TYPE_INT:
467 case GLSL_TYPE_FLOAT:
468 case GLSL_TYPE_BOOL:
469 return type->components();
470 case GLSL_TYPE_ARRAY:
471 return type_size(type->fields.array) * type->length;
472 case GLSL_TYPE_STRUCT:
473 size = 0;
474 for (i = 0; i < type->length; i++) {
475 size += type_size(type->fields.structure[i].type);
476 }
477 return size;
478 case GLSL_TYPE_SAMPLER:
479 /* Samplers take up no register space, since they're baked in at
480 * link time.
481 */
482 return 0;
483 case GLSL_TYPE_VOID:
484 case GLSL_TYPE_ERROR:
485 case GLSL_TYPE_INTERFACE:
486 assert(!"not reached");
487 break;
488 }
489
490 return 0;
491 }
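/* Example sizes (illustrative): float and bool are 1 slot, vec4 is 4,
 * mat3 is 9 (type->components()), "vec4 a[3]" is 12, and a struct is the
 * sum of its members. The count is in scalar components, which is what
 * the automatic fs_reg constructor below hands to virtual_grf_alloc().
 */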
492
493 fs_reg
494 fs_visitor::get_timestamp()
495 {
496 assert(brw->gen >= 7);
497
498 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
499 BRW_ARF_TIMESTAMP,
500 0),
501 BRW_REGISTER_TYPE_UD));
502
503 fs_reg dst = fs_reg(this, glsl_type::uint_type);
504
505 fs_inst *mov = emit(MOV(dst, ts));
506 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
507 * even if it's not enabled in the dispatch.
508 */
509 mov->force_writemask_all = true;
510 mov->force_uncompressed = true;
511
512 /* The caller wants the low 32 bits of the timestamp. Since it's running
513 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
514 * which is plenty of time for our purposes. It is identical across the
515 * EUs, but since it's tracking GPU core speed it will increment at a
516 * varying rate as render P-states change.
517 *
518 * The caller could also check if render P-states have changed (or anything
519 * else that might disrupt timing) by setting smear to 2 and checking if
520 * that field is != 0.
521 */
522 dst.smear = 0;
523
524 return dst;
525 }
526
527 void
528 fs_visitor::emit_shader_time_begin()
529 {
530 current_annotation = "shader time start";
531 shader_start_time = get_timestamp();
532 }
533
534 void
535 fs_visitor::emit_shader_time_end()
536 {
537 current_annotation = "shader time end";
538
539 enum shader_time_shader_type type, written_type, reset_type;
540 if (dispatch_width == 8) {
541 type = ST_FS8;
542 written_type = ST_FS8_WRITTEN;
543 reset_type = ST_FS8_RESET;
544 } else {
545 assert(dispatch_width == 16);
546 type = ST_FS16;
547 written_type = ST_FS16_WRITTEN;
548 reset_type = ST_FS16_RESET;
549 }
550
551 fs_reg shader_end_time = get_timestamp();
552
553 /* Check that there weren't any timestamp reset events (assuming these
554 * were the only two timestamp reads that happened).
555 */
556 fs_reg reset = shader_end_time;
557 reset.smear = 2;
558 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
559 test->conditional_mod = BRW_CONDITIONAL_Z;
560 emit(IF(BRW_PREDICATE_NORMAL));
561
562 push_force_uncompressed();
563 fs_reg start = shader_start_time;
564 start.negate = true;
565 fs_reg diff = fs_reg(this, glsl_type::uint_type);
566 emit(ADD(diff, start, shader_end_time));
567
568 /* If there were no instructions between the two timestamp gets, the diff
569 * is 2 cycles. Remove that overhead, so I can forget about that when
570 * trying to determine the time taken for single instructions.
571 */
572 emit(ADD(diff, diff, fs_reg(-2u)));
573
574 emit_shader_time_write(type, diff);
575 emit_shader_time_write(written_type, fs_reg(1u));
576 emit(BRW_OPCODE_ELSE);
577 emit_shader_time_write(reset_type, fs_reg(1u));
578 emit(BRW_OPCODE_ENDIF);
579
580 pop_force_uncompressed();
581 }
582
583 void
584 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
585 fs_reg value)
586 {
587 int shader_time_index =
588 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
589 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
590
591 fs_reg payload;
592 if (dispatch_width == 8)
593 payload = fs_reg(this, glsl_type::uvec2_type);
594 else
595 payload = fs_reg(this, glsl_type::uint_type);
596
597 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
598 fs_reg(), payload, offset, value));
599 }
600
601 void
602 fs_visitor::fail(const char *format, ...)
603 {
604 va_list va;
605 char *msg;
606
607 if (failed)
608 return;
609
610 failed = true;
611
612 va_start(va, format);
613 msg = ralloc_vasprintf(mem_ctx, format, va);
614 va_end(va);
615 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
616
617 this->fail_msg = msg;
618
619 if (INTEL_DEBUG & DEBUG_WM) {
620 fprintf(stderr, "%s", msg);
621 }
622 }
623
624 fs_inst *
625 fs_visitor::emit(enum opcode opcode)
626 {
627 return emit(fs_inst(opcode));
628 }
629
630 fs_inst *
631 fs_visitor::emit(enum opcode opcode, fs_reg dst)
632 {
633 return emit(fs_inst(opcode, dst));
634 }
635
636 fs_inst *
637 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
638 {
639 return emit(fs_inst(opcode, dst, src0));
640 }
641
642 fs_inst *
643 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
644 {
645 return emit(fs_inst(opcode, dst, src0, src1));
646 }
647
648 fs_inst *
649 fs_visitor::emit(enum opcode opcode, fs_reg dst,
650 fs_reg src0, fs_reg src1, fs_reg src2)
651 {
652 return emit(fs_inst(opcode, dst, src0, src1, src2));
653 }
654
655 void
656 fs_visitor::push_force_uncompressed()
657 {
658 force_uncompressed_stack++;
659 }
660
661 void
662 fs_visitor::pop_force_uncompressed()
663 {
664 force_uncompressed_stack--;
665 assert(force_uncompressed_stack >= 0);
666 }
667
668 void
669 fs_visitor::push_force_sechalf()
670 {
671 force_sechalf_stack++;
672 }
673
674 void
675 fs_visitor::pop_force_sechalf()
676 {
677 force_sechalf_stack--;
678 assert(force_sechalf_stack >= 0);
679 }
680
681 /**
682 * Returns true if the instruction has a flag that means it won't
683 * update an entire destination register.
684 *
685 * For example, dead code elimination and live variable analysis want to know
686 * when a write to a variable screens off any preceding values that were in
687 * it.
688 */
689 bool
690 fs_inst::is_partial_write()
691 {
692 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
693 this->force_uncompressed ||
694 this->force_sechalf);
695 }
696
697 /**
698 * Returns how many MRFs an FS opcode will write over.
699 *
700 * Note that this is not the 0 or 1 implied writes in an actual gen
701 * instruction -- the FS opcodes often generate MOVs in addition.
702 */
703 int
704 fs_visitor::implied_mrf_writes(fs_inst *inst)
705 {
706 if (inst->mlen == 0)
707 return 0;
708
709 switch (inst->opcode) {
710 case SHADER_OPCODE_RCP:
711 case SHADER_OPCODE_RSQ:
712 case SHADER_OPCODE_SQRT:
713 case SHADER_OPCODE_EXP2:
714 case SHADER_OPCODE_LOG2:
715 case SHADER_OPCODE_SIN:
716 case SHADER_OPCODE_COS:
717 return 1 * dispatch_width / 8;
718 case SHADER_OPCODE_POW:
719 case SHADER_OPCODE_INT_QUOTIENT:
720 case SHADER_OPCODE_INT_REMAINDER:
721 return 2 * dispatch_width / 8;
722 case SHADER_OPCODE_TEX:
723 case FS_OPCODE_TXB:
724 case SHADER_OPCODE_TXD:
725 case SHADER_OPCODE_TXF:
726 case SHADER_OPCODE_TXF_MS:
727 case SHADER_OPCODE_TXL:
728 case SHADER_OPCODE_TXS:
729 case SHADER_OPCODE_LOD:
730 return 1;
731 case FS_OPCODE_FB_WRITE:
732 return 2;
733 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
734 case FS_OPCODE_UNSPILL:
735 return 1;
736 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
737 return inst->mlen;
738 case FS_OPCODE_SPILL:
739 return 2;
740 default:
741 assert(!"not reached");
742 return inst->mlen;
743 }
744 }
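/* For instance (illustrative): a SIMD16 POW reports 2 * 16/8 == 4 MRFs
 * written, while a SIMD8 RCP reports 1. Per the comment above, these
 * counts cover the MOVs that load the message registers, not the SEND
 * itself.
 */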
745
746 int
747 fs_visitor::virtual_grf_alloc(int size)
748 {
749 if (virtual_grf_array_size <= virtual_grf_count) {
750 if (virtual_grf_array_size == 0)
751 virtual_grf_array_size = 16;
752 else
753 virtual_grf_array_size *= 2;
754 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
755 virtual_grf_array_size);
756 }
757 virtual_grf_sizes[virtual_grf_count] = size;
758 return virtual_grf_count++;
759 }
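/* The sizes array grows by the usual doubling scheme (16, 32, 64, ...),
 * so repeated allocations are amortized O(1). The returned index is what
 * fs_reg::reg stores for GRF-file registers.
 */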
760
761 /** Fixed HW reg constructor. */
762 fs_reg::fs_reg(enum register_file file, int reg)
763 {
764 init();
765 this->file = file;
766 this->reg = reg;
767 this->type = BRW_REGISTER_TYPE_F;
768 }
769
770 /** Fixed HW reg constructor. */
771 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
772 {
773 init();
774 this->file = file;
775 this->reg = reg;
776 this->type = type;
777 }
778
779 /** Automatic reg constructor. */
780 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
781 {
782 init();
783
784 this->file = GRF;
785 this->reg = v->virtual_grf_alloc(v->type_size(type));
786 this->reg_offset = 0;
787 this->type = brw_type_for_base_type(type);
788 }
789
790 fs_reg *
791 fs_visitor::variable_storage(ir_variable *var)
792 {
793 return (fs_reg *)hash_table_find(this->variable_ht, var);
794 }
795
796 void
797 import_uniforms_callback(const void *key,
798 void *data,
799 void *closure)
800 {
801 struct hash_table *dst_ht = (struct hash_table *)closure;
802 const fs_reg *reg = (const fs_reg *)data;
803
804 if (reg->file != UNIFORM)
805 return;
806
807 hash_table_insert(dst_ht, data, key);
808 }
809
810 /* For 16-wide, we need to carry over the uniform setup from the 8-wide
811 * dispatch. This brings in those uniform definitions.
812 */
813 void
814 fs_visitor::import_uniforms(fs_visitor *v)
815 {
816 hash_table_call_foreach(v->variable_ht,
817 import_uniforms_callback,
818 variable_ht);
819 this->params_remap = v->params_remap;
820 this->nr_params_remap = v->nr_params_remap;
821 }
822
823 /* Our support for uniforms is piggy-backed on the struct
824 * gl_fragment_program, because that's where the values actually
825 * get stored, rather than in some global gl_shader_program uniform
826 * store.
827 */
828 void
829 fs_visitor::setup_uniform_values(ir_variable *ir)
830 {
831 int namelen = strlen(ir->name);
832
833 /* The data for our (non-builtin) uniforms is stored in a series of
834 * gl_uniform_driver_storage structs for each subcomponent that
835 * glGetUniformLocation() could name. We know it's been set up in the same
836 * order we'd walk the type, so walk the list of storage and find anything
837 * with our name, or the prefix of a component that starts with our name.
838 */
839 unsigned params_before = c->prog_data.nr_params;
840 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
841 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
842
843 if (strncmp(ir->name, storage->name, namelen) != 0 ||
844 (storage->name[namelen] != 0 &&
845 storage->name[namelen] != '.' &&
846 storage->name[namelen] != '[')) {
847 continue;
848 }
849
850 unsigned slots = storage->type->component_slots();
851 if (storage->array_elements)
852 slots *= storage->array_elements;
853
854 for (unsigned i = 0; i < slots; i++) {
855 c->prog_data.param[c->prog_data.nr_params++] =
856 &storage->storage[i].f;
857 }
858 }
859
860 /* Make sure we actually initialized the right amount of stuff here. */
861 assert(params_before + ir->type->component_slots() ==
862 c->prog_data.nr_params);
863 (void)params_before;
864 }
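/* Sketch (hypothetical declaration): for "uniform vec3 color[2]", the
 * name "color" prefix-matches its storage entry, component_slots() == 3
 * and array_elements == 2, so six pointers (&storage->storage[i].f) are
 * appended to c->prog_data.param in walk order.
 */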
865
866
867 /* Our support for builtin uniforms is even scarier than non-builtin.
868 * It sits on top of the PROG_STATE_VAR parameters that are
869 * automatically updated from GL context state.
870 */
871 void
872 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
873 {
874 const ir_state_slot *const slots = ir->state_slots;
875 assert(ir->state_slots != NULL);
876
877 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
878 /* This state reference has already been setup by ir_to_mesa, but we'll
879 * get the same index back here.
880 */
881 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
882 (gl_state_index *)slots[i].tokens);
883
884 /* Add each of the unique swizzles of the element as a parameter.
885 * This'll end up matching the expected layout of the
886 * array/matrix/structure we're trying to fill in.
887 */
888 int last_swiz = -1;
889 for (unsigned int j = 0; j < 4; j++) {
890 int swiz = GET_SWZ(slots[i].swizzle, j);
891 if (swiz == last_swiz)
892 break;
893 last_swiz = swiz;
894
895 c->prog_data.param[c->prog_data.nr_params++] =
896 &fp->Base.Parameters->ParameterValues[index][swiz].f;
897 }
898 }
899 }
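/* Illustrative example: a state slot whose swizzle is XXXX (a scalar
 * state value replicated across components) adds a single param, because
 * the loop breaks as soon as a swizzle component repeats; a full XYZW
 * slot adds four params, matching a vec4's layout.
 */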
900
901 fs_reg *
902 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
903 {
904 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
905 fs_reg wpos = *reg;
906 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
907
908 /* gl_FragCoord.x */
909 if (ir->pixel_center_integer) {
910 emit(MOV(wpos, this->pixel_x));
911 } else {
912 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
913 }
914 wpos.reg_offset++;
915
916 /* gl_FragCoord.y */
917 if (!flip && ir->pixel_center_integer) {
918 emit(MOV(wpos, this->pixel_y));
919 } else {
920 fs_reg pixel_y = this->pixel_y;
921 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
922
923 if (flip) {
924 pixel_y.negate = true;
925 offset += c->key.drawable_height - 1.0;
926 }
927
928 emit(ADD(wpos, pixel_y, fs_reg(offset)));
929 }
930 wpos.reg_offset++;
931
932 /* gl_FragCoord.z */
933 if (brw->gen >= 6) {
934 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
935 } else {
936 emit(FS_OPCODE_LINTERP, wpos,
937 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
938 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
939 interp_reg(VARYING_SLOT_POS, 2));
940 }
941 wpos.reg_offset++;
942
943 /* gl_FragCoord.w: Already set up in emit_interpolation */
944 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
945
946 return reg;
947 }
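/* The Y flip above, spelled out (illustrative arithmetic): with flip
 * set, negating pixel_y and adding drawable_height - 1.0 (plus 0.5 for
 * non-integer pixel centers) computes (height - 1) - pixel_y, converting
 * between GL's lower-left origin and the window system's upper-left one.
 */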
948
949 fs_inst *
950 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
951 glsl_interp_qualifier interpolation_mode,
952 bool is_centroid)
953 {
954 brw_wm_barycentric_interp_mode barycoord_mode;
955 if (brw->gen >= 6) {
956 if (is_centroid) {
957 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
958 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
959 else
960 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
961 } else {
962 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
963 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
964 else
965 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
966 }
967 } else {
968 /* On Ironlake and below, there is only one interpolation mode.
969 * Centroid interpolation doesn't mean anything on this hardware --
970 * there is no multisampling.
971 */
972 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
973 }
974 return emit(FS_OPCODE_LINTERP, attr,
975 this->delta_x[barycoord_mode],
976 this->delta_y[barycoord_mode], interp);
977 }
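/* FS_OPCODE_LINTERP evaluates a plane equation, roughly
 * attr = a0 + ax * delta_x + ay * delta_y, where delta_x/delta_y are the
 * barycentric deltas selected above and (a0, ax, ay) come from the
 * per-attribute setup data in interp. On hardware with PLN this lowers
 * to one instruction; otherwise to a LINE/MAC pair.
 */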
978
979 fs_reg *
980 fs_visitor::emit_general_interpolation(ir_variable *ir)
981 {
982 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
983 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
984 fs_reg attr = *reg;
985
986 unsigned int array_elements;
987 const glsl_type *type;
988
989 if (ir->type->is_array()) {
990 array_elements = ir->type->length;
991 if (array_elements == 0) {
992 fail("dereferenced array '%s' has length 0\n", ir->name);
993 }
994 type = ir->type->fields.array;
995 } else {
996 array_elements = 1;
997 type = ir->type;
998 }
999
1000 glsl_interp_qualifier interpolation_mode =
1001 ir->determine_interpolation_mode(c->key.flat_shade);
1002
1003 int location = ir->location;
1004 for (unsigned int i = 0; i < array_elements; i++) {
1005 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1006 if (urb_setup[location] == -1) {
1007 /* If there's no incoming setup data for this slot, don't
1008 * emit interpolation for it.
1009 */
1010 attr.reg_offset += type->vector_elements;
1011 location++;
1012 continue;
1013 }
1014
1015 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1016 /* Constant interpolation (flat shading) case. The SF has
1017 * handed us defined values in only the constant offset
1018 * field of the setup reg.
1019 */
1020 for (unsigned int k = 0; k < type->vector_elements; k++) {
1021 struct brw_reg interp = interp_reg(location, k);
1022 interp = suboffset(interp, 3);
1023 interp.type = reg->type;
1024 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1025 attr.reg_offset++;
1026 }
1027 } else {
1028 /* Smooth/noperspective interpolation case. */
1029 for (unsigned int k = 0; k < type->vector_elements; k++) {
1030 /* FINISHME: At some point we probably want to push
1031 * this farther by giving similar treatment to the
1032 * other potentially constant components of the
1033 * attribute, as well as making brw_vs_constval.c
1034 * handle varyings other than gl_TexCoord.
1035 */
1036 struct brw_reg interp = interp_reg(location, k);
1037 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1038 ir->centroid);
1039 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1040 /* Get the pixel/sample mask into f0 so that we know
1041 * which pixels are lit. Then, for each channel that is
1042 * unlit, replace the centroid data with non-centroid
1043 * data.
1044 */
1045 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1046 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1047 interpolation_mode, false);
1048 inst->predicate = BRW_PREDICATE_NORMAL;
1049 inst->predicate_inverse = true;
1050 }
1051 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1052 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1053 }
1054 attr.reg_offset++;
1055 }
1056
1057 }
1058 location++;
1059 }
1060 }
1061
1062 return reg;
1063 }
1064
1065 fs_reg *
1066 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1067 {
1068 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1069
1070 /* The frontfacing comes in as a bit in the thread payload. */
1071 if (brw->gen >= 6) {
1072 emit(BRW_OPCODE_ASR, *reg,
1073 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1074 fs_reg(15));
1075 emit(BRW_OPCODE_NOT, *reg, *reg);
1076 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1077 } else {
1078 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1079 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1080 * us front face
1081 */
1082 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1083 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1084 }
1085
1086 return reg;
1087 }
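/* Gen6 path, spelled out (illustrative): bit 15 of g0.0 is the
 * back-facing flag, so ASR by 15, NOT, then AND with 1 leaves 1 for
 * front-facing channels and 0 otherwise. The pre-gen6 path instead
 * tests the sign bit (bit 31) of g1.6, as the comment above notes.
 */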
1088
1089 fs_reg
1090 fs_visitor::fix_math_operand(fs_reg src)
1091 {
1092 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1093 * might be able to do better by doing execsize = 1 math and then
1094 * expanding that result out, but we would need to be careful with
1095 * masking.
1096 *
1097 * The hardware ignores source modifiers (negate and abs) on math
1098 * instructions, so we also move to a temp to set those up.
1099 */
1100 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1101 !src.abs && !src.negate)
1102 return src;
1103
1104 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1105 * operands to math
1106 */
1107 if (brw->gen >= 7 && src.file != IMM)
1108 return src;
1109
1110 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1111 expanded.type = src.type;
1112 emit(BRW_OPCODE_MOV, expanded, src);
1113 return expanded;
1114 }
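/* Example (illustrative; x and dst are hypothetical): on gen6,
 * emit_math(SHADER_OPCODE_POW, dst, x, fs_reg(2.0f)) cannot feed the
 * immediate straight to the math unit, so fix_math_operand() MOVs 2.0f
 * into a fresh GRF temp and the POW reads the temp instead.
 */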
1115
1116 fs_inst *
1117 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1118 {
1119 switch (opcode) {
1120 case SHADER_OPCODE_RCP:
1121 case SHADER_OPCODE_RSQ:
1122 case SHADER_OPCODE_SQRT:
1123 case SHADER_OPCODE_EXP2:
1124 case SHADER_OPCODE_LOG2:
1125 case SHADER_OPCODE_SIN:
1126 case SHADER_OPCODE_COS:
1127 break;
1128 default:
1129 assert(!"not reached: bad math opcode");
1130 return NULL;
1131 }
1132
1133 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1134 * might be able to do better by doing execsize = 1 math and then
1135 * expanding that result out, but we would need to be careful with
1136 * masking.
1137 *
1138 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1139 * instructions, so we also move to a temp to set those up.
1140 */
1141 if (brw->gen >= 6)
1142 src = fix_math_operand(src);
1143
1144 fs_inst *inst = emit(opcode, dst, src);
1145
1146 if (brw->gen < 6) {
1147 inst->base_mrf = 2;
1148 inst->mlen = dispatch_width / 8;
1149 }
1150
1151 return inst;
1152 }
1153
1154 fs_inst *
1155 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1156 {
1157 int base_mrf = 2;
1158 fs_inst *inst;
1159
1160 switch (opcode) {
1161 case SHADER_OPCODE_INT_QUOTIENT:
1162 case SHADER_OPCODE_INT_REMAINDER:
1163 if (brw->gen >= 7 && dispatch_width == 16)
1164 fail("16-wide INTDIV unsupported\n");
1165 break;
1166 case SHADER_OPCODE_POW:
1167 break;
1168 default:
1169 assert(!"not reached: unsupported binary math opcode.");
1170 return NULL;
1171 }
1172
1173 if (brw->gen >= 6) {
1174 src0 = fix_math_operand(src0);
1175 src1 = fix_math_operand(src1);
1176
1177 inst = emit(opcode, dst, src0, src1);
1178 } else {
1179 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1180 * "Message Payload":
1181 *
1182 * "Operand0[7]. For the INT DIV functions, this operand is the
1183 * denominator."
1184 * ...
1185 * "Operand1[7]. For the INT DIV functions, this operand is the
1186 * numerator."
1187 */
1188 bool is_int_div = opcode != SHADER_OPCODE_POW;
1189 fs_reg &op0 = is_int_div ? src1 : src0;
1190 fs_reg &op1 = is_int_div ? src0 : src1;
1191
1192 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1193 inst = emit(opcode, dst, op0, reg_null_f);
1194
1195 inst->base_mrf = base_mrf;
1196 inst->mlen = 2 * dispatch_width / 8;
1197 }
1198 return inst;
1199 }
1200
1201 void
1202 fs_visitor::assign_curb_setup()
1203 {
1204 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1205 if (dispatch_width == 8) {
1206 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1207 } else {
1208 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1209 }
1210
1211 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1212 foreach_list(node, &this->instructions) {
1213 fs_inst *inst = (fs_inst *)node;
1214
1215 for (unsigned int i = 0; i < 3; i++) {
1216 if (inst->src[i].file == UNIFORM) {
1217 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1218 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1219 constant_nr / 8,
1220 constant_nr % 8);
1221
1222 inst->src[i].file = HW_REG;
1223 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1224 }
1225 }
1226 }
1227 }
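/* Push-constant addressing, concretely (illustrative numbers): uniform
 * slot 11 with c->nr_payload_regs == 2 maps to brw_vec1_grf(2 + 11 / 8,
 * 11 % 8), i.e. subregister 3 of g3, since each GRF holds eight floats
 * of push constants.
 */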
1228
1229 void
1230 fs_visitor::calculate_urb_setup()
1231 {
1232 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1233 urb_setup[i] = -1;
1234 }
1235
1236 int urb_next = 0;
1237 /* Figure out where each of the incoming setup attributes lands. */
1238 if (brw->gen >= 6) {
1239 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1240 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1241 urb_setup[i] = urb_next++;
1242 }
1243 }
1244 } else {
1245 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1246 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1247 /* Point size is packed into the header, not as a general attribute */
1248 if (i == VARYING_SLOT_PSIZ)
1249 continue;
1250
1251 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1252 /* The back color slot is skipped when the front color is
1253 * also written to. In addition, some slots can be
1254 * written in the vertex shader and not read in the
1255 * fragment shader. So the register number must always be
1256 * incremented, mapped or not.
1257 */
1258 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1259 urb_setup[i] = urb_next;
1260 urb_next++;
1261 }
1262 }
1263
1264 /*
1265 * It's an FS-only attribute, and we did interpolation for this attribute
1266 * in the SF thread. So, count it here, too.
1267 *
1268 * See compile_sf_prog() for more info.
1269 */
1270 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1271 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1272 }
1273
1274 /* Each attribute is 4 setup channels, each of which is half a reg. */
1275 c->prog_data.urb_read_length = urb_next * 2;
1276 }
1277
1278 void
1279 fs_visitor::assign_urb_setup()
1280 {
1281 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1282
1283 /* Offset all the urb_setup[] index by the actual position of the
1284 * setup regs, now that the location of the constants has been chosen.
1285 */
1286 foreach_list(node, &this->instructions) {
1287 fs_inst *inst = (fs_inst *)node;
1288
1289 if (inst->opcode == FS_OPCODE_LINTERP) {
1290 assert(inst->src[2].file == HW_REG);
1291 inst->src[2].fixed_hw_reg.nr += urb_start;
1292 }
1293
1294 if (inst->opcode == FS_OPCODE_CINTERP) {
1295 assert(inst->src[0].file == HW_REG);
1296 inst->src[0].fixed_hw_reg.nr += urb_start;
1297 }
1298 }
1299
1300 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1301 }
1302
1303 /**
1304 * Split large virtual GRFs into separate components if we can.
1305 *
1306 * This is mostly duplicated with what brw_fs_vector_splitting does,
1307 * but that's really conservative because it's afraid of doing
1308 * splitting that doesn't result in real progress after the rest of
1309 * the optimization phases, which would cause infinite looping in
1310 * optimization. We can do it once here, safely. This also has the
1311 * opportunity to split interpolated values, or maybe even uniforms,
1312 * which we don't have at the IR level.
1313 *
1314 * We want to split, because virtual GRFs are what we register
1315 * allocate and spill (due to contiguousness requirements for some
1316 * instructions), and they're what we naturally generate in the
1317 * codegen process, but most virtual GRFs don't actually need to be
1318 * contiguous sets of GRFs. If we split, we'll end up with reduced
1319 * live intervals and better dead code elimination and coalescing.
1320 */
1321 void
1322 fs_visitor::split_virtual_grfs()
1323 {
1324 int num_vars = this->virtual_grf_count;
1325 bool split_grf[num_vars];
1326 int new_virtual_grf[num_vars];
1327
1328 /* Try to split anything > 0 sized. */
1329 for (int i = 0; i < num_vars; i++) {
1330 if (this->virtual_grf_sizes[i] != 1)
1331 split_grf[i] = true;
1332 else
1333 split_grf[i] = false;
1334 }
1335
1336 if (brw->has_pln &&
1337 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1338 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1339 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1340 * Gen6, that was the only supported interpolation mode, and since Gen6,
1341 * delta_x and delta_y are in fixed hardware registers.
1342 */
1343 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1344 false;
1345 }
1346
1347 foreach_list(node, &this->instructions) {
1348 fs_inst *inst = (fs_inst *)node;
1349
1350 /* If there's a SEND message that requires contiguous destination
1351 * registers, no splitting is allowed.
1352 */
1353 if (inst->regs_written > 1) {
1354 split_grf[inst->dst.reg] = false;
1355 }
1356
1357 /* If we're sending from a GRF, don't split it, on the assumption that
1358 * the send is reading the whole thing.
1359 */
1360 if (inst->is_send_from_grf()) {
1361 split_grf[inst->src[0].reg] = false;
1362 }
1363 }
1364
1365 /* Allocate new space for split regs. Note that the virtual
1366 * numbers will be contiguous.
1367 */
1368 for (int i = 0; i < num_vars; i++) {
1369 if (split_grf[i]) {
1370 new_virtual_grf[i] = virtual_grf_alloc(1);
1371 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1372 int reg = virtual_grf_alloc(1);
1373 assert(reg == new_virtual_grf[i] + j - 1);
1374 (void) reg;
1375 }
1376 this->virtual_grf_sizes[i] = 1;
1377 }
1378 }
1379
1380 foreach_list(node, &this->instructions) {
1381 fs_inst *inst = (fs_inst *)node;
1382
1383 if (inst->dst.file == GRF &&
1384 split_grf[inst->dst.reg] &&
1385 inst->dst.reg_offset != 0) {
1386 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1387 inst->dst.reg_offset - 1);
1388 inst->dst.reg_offset = 0;
1389 }
1390 for (int i = 0; i < 3; i++) {
1391 if (inst->src[i].file == GRF &&
1392 split_grf[inst->src[i].reg] &&
1393 inst->src[i].reg_offset != 0) {
1394 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1395 inst->src[i].reg_offset - 1);
1396 inst->src[i].reg_offset = 0;
1397 }
1398 }
1399 }
1400 this->live_intervals_valid = false;
1401 }
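/* Sketch of the remapping (illustrative): a size-3 VGRF v becomes three
 * size-1 VGRFs. reg_offset 0 stays in v itself, while offsets 1 and 2
 * move to new_virtual_grf[v] + 0 and + 1 with reg_offset reset to 0, so
 * each piece gets an independent live interval.
 */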
1402
1403 /**
1404 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1405 *
1406 * During code generation, we create tons of temporary variables, many of
1407 * which get immediately killed and are never used again. Yet, in later
1408 * optimization and analysis passes, such as compute_live_intervals, we need
1409 * to loop over all the virtual GRFs. Compacting them can save a lot of
1410 * overhead.
1411 */
1412 void
1413 fs_visitor::compact_virtual_grfs()
1414 {
1415 /* Mark which virtual GRFs are used, and count how many. */
1416 int remap_table[this->virtual_grf_count];
1417 memset(remap_table, -1, sizeof(remap_table));
1418
1419 foreach_list(node, &this->instructions) {
1420 const fs_inst *inst = (const fs_inst *) node;
1421
1422 if (inst->dst.file == GRF)
1423 remap_table[inst->dst.reg] = 0;
1424
1425 for (int i = 0; i < 3; i++) {
1426 if (inst->src[i].file == GRF)
1427 remap_table[inst->src[i].reg] = 0;
1428 }
1429 }
1430
1431 /* In addition to registers used in instructions, fs_visitor keeps
1432 * direct references to certain special values which must be patched:
1433 */
1434 fs_reg *special[] = {
1435 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1436 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1437 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1438 &delta_x[0], &delta_x[1], &delta_x[2],
1439 &delta_x[3], &delta_x[4], &delta_x[5],
1440 &delta_y[0], &delta_y[1], &delta_y[2],
1441 &delta_y[3], &delta_y[4], &delta_y[5],
1442 };
1443 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1444 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1445
1446 /* Treat all special values as used, to be conservative */
1447 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1448 if (special[i]->file == GRF)
1449 remap_table[special[i]->reg] = 0;
1450 }
1451
1452 /* Compact the GRF arrays. */
1453 int new_index = 0;
1454 for (int i = 0; i < this->virtual_grf_count; i++) {
1455 if (remap_table[i] != -1) {
1456 remap_table[i] = new_index;
1457 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1458 if (live_intervals_valid) {
1459 virtual_grf_start[new_index] = virtual_grf_start[i];
1460 virtual_grf_end[new_index] = virtual_grf_end[i];
1461 }
1462 ++new_index;
1463 }
1464 }
1465
1466 this->virtual_grf_count = new_index;
1467
1468 /* Patch all the instructions to use the newly renumbered registers */
1469 foreach_list(node, &this->instructions) {
1470 fs_inst *inst = (fs_inst *) node;
1471
1472 if (inst->dst.file == GRF)
1473 inst->dst.reg = remap_table[inst->dst.reg];
1474
1475 for (int i = 0; i < 3; i++) {
1476 if (inst->src[i].file == GRF)
1477 inst->src[i].reg = remap_table[inst->src[i].reg];
1478 }
1479 }
1480
1481 /* Patch all the references to special values */
1482 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1483 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1484 special[i]->reg = remap_table[special[i]->reg];
1485 }
1486 }
1487
1488 bool
1489 fs_visitor::remove_dead_constants()
1490 {
1491 if (dispatch_width == 8) {
1492 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1493 this->nr_params_remap = c->prog_data.nr_params;
1494
1495 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1496 this->params_remap[i] = -1;
1497
1498 /* Find which params are still in use. */
1499 foreach_list(node, &this->instructions) {
1500 fs_inst *inst = (fs_inst *)node;
1501
1502 for (int i = 0; i < 3; i++) {
1503 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1504
1505 if (inst->src[i].file != UNIFORM)
1506 continue;
1507
1508 /* Section 5.11 of the OpenGL 4.3 spec says:
1509 *
1510 * "Out-of-bounds reads return undefined values, which include
1511 * values from other variables of the active program or zero."
1512 */
1513 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1514 constant_nr = 0;
1515 }
1516
1517 /* For now, set this to non-negative. We'll give it the
1518 * actual new number in a moment, in order to keep the
1519 * register numbers nicely ordered.
1520 */
1521 this->params_remap[constant_nr] = 0;
1522 }
1523 }
1524
1525 /* Figure out what the new numbers for the params will be. At some
1526 * point when we're doing uniform array access, we're going to want
1527 * to keep the distinction between .reg and .reg_offset, but for
1528 * now we don't care.
1529 */
1530 unsigned int new_nr_params = 0;
1531 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1532 if (this->params_remap[i] != -1) {
1533 this->params_remap[i] = new_nr_params++;
1534 }
1535 }
1536
1537 /* Update the list of params to be uploaded to match our new numbering. */
1538 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1539 int remapped = this->params_remap[i];
1540
1541 if (remapped == -1)
1542 continue;
1543
1544 c->prog_data.param[remapped] = c->prog_data.param[i];
1545 }
1546
1547 c->prog_data.nr_params = new_nr_params;
1548 } else {
1549 /* This should have been generated in the 8-wide pass already. */
1550 assert(this->params_remap);
1551 }
1552
1553 /* Now do the renumbering of the shader to remove unused params. */
1554 foreach_list(node, &this->instructions) {
1555 fs_inst *inst = (fs_inst *)node;
1556
1557 for (int i = 0; i < 3; i++) {
1558 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1559
1560 if (inst->src[i].file != UNIFORM)
1561 continue;
1562
1563 /* As above, alias out-of-bounds accesses to constant 0. */
1564 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1565 constant_nr = 0;
1566 }
1567 assert(this->params_remap[constant_nr] != -1);
1568 inst->src[i].reg = this->params_remap[constant_nr];
1569 inst->src[i].reg_offset = 0;
1570 }
1571 }
1572
1573 return true;
1574 }
1575
1576 /*
1577 * Implements array access of uniforms by inserting a
1578 * PULL_CONSTANT_LOAD instruction.
1579 *
1580 * Unlike temporary GRF array access (where we don't support it due to
1581 * the difficulty of doing relative addressing on instruction
1582 * destinations), we could potentially do array access of uniforms
1583 * that were loaded in GRF space as push constants. In real-world
1584 * usage we've seen, though, the arrays being used are always larger
1585 * than we could load as push constants, so just always move all
1586 * uniform array access out to a pull constant buffer.
1587 */
1588 void
1589 fs_visitor::move_uniform_array_access_to_pull_constants()
1590 {
1591 int pull_constant_loc[c->prog_data.nr_params];
1592
1593 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1594 pull_constant_loc[i] = -1;
1595 }
1596
1597 /* Walk through and find array access of uniforms. Put a copy of that
1598 * uniform in the pull constant buffer.
1599 *
1600 * Note that we don't move constant-indexed accesses to arrays. No
1601 * testing has been done of the performance impact of this choice.
1602 */
1603 foreach_list_safe(node, &this->instructions) {
1604 fs_inst *inst = (fs_inst *)node;
1605
1606 for (int i = 0 ; i < 3; i++) {
1607 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1608 continue;
1609
1610 int uniform = inst->src[i].reg;
1611
1612 /* If this array isn't already present in the pull constant buffer,
1613 * add it.
1614 */
1615 if (pull_constant_loc[uniform] == -1) {
1616 const float **values = &c->prog_data.param[uniform];
1617
1618 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1619
1620 assert(param_size[uniform]);
1621
1622 for (int j = 0; j < param_size[uniform]; j++) {
1623 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1624 values[j];
1625 }
1626 }
1627
1628 /* Set up the annotation tracking for new generated instructions. */
1629 base_ir = inst->ir;
1630 current_annotation = inst->annotation;
1631
1632 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1633 fs_reg temp = fs_reg(this, glsl_type::float_type);
1634 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1635 surf_index,
1636 *inst->src[i].reladdr,
1637 pull_constant_loc[uniform] +
1638 inst->src[i].reg_offset);
1639 inst->insert_before(&list);
1640
1641 inst->src[i].file = temp.file;
1642 inst->src[i].reg = temp.reg;
1643 inst->src[i].reg_offset = temp.reg_offset;
1644 inst->src[i].reladdr = NULL;
1645 }
1646 }
1647 }
1648
1649 /**
1650 * Choose accesses from the UNIFORM file to demote to using the pull
1651 * constant buffer.
1652 *
1653 * We allow a fragment shader to have more than the specified minimum
1654 * maximum number of fragment shader uniform components (64). If
1655 * there are too many of these, they'd fill up all of register space.
1656 * So, this will push some of them out to the pull constant buffer and
1657 * update the program to load them.
1658 */
1659 void
1660 fs_visitor::setup_pull_constants()
1661 {
1662 /* Only allow 16 registers (128 uniform components) as push constants. */
1663 unsigned int max_uniform_components = 16 * 8;
1664 if (c->prog_data.nr_params <= max_uniform_components)
1665 return;
1666
1667 if (dispatch_width == 16) {
1668 fail("Pull constants not supported in 16-wide\n");
1669 return;
1670 }
1671
1672 /* Just demote the end of the list. We could probably do better
1673 * here, demoting things that are rarely used in the program first.
1674 */
1675 unsigned int pull_uniform_base = max_uniform_components;
1676
1677 int pull_constant_loc[c->prog_data.nr_params];
1678 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1679 if (i < pull_uniform_base) {
1680 pull_constant_loc[i] = -1;
1681 } else {
1682 pull_constant_loc[i] = -1;
1683 /* If our constant is already being uploaded for reladdr purposes,
1684 * reuse it.
1685 */
1686 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1687 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1688 pull_constant_loc[i] = j;
1689 break;
1690 }
1691 }
1692 if (pull_constant_loc[i] == -1) {
1693 int pull_index = c->prog_data.nr_pull_params++;
1694 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1695 pull_constant_loc[i] = pull_index;
1696 }
1697 }
1698 }
1699 c->prog_data.nr_params = pull_uniform_base;
1700
1701 foreach_list(node, &this->instructions) {
1702 fs_inst *inst = (fs_inst *)node;
1703
1704 for (int i = 0; i < 3; i++) {
1705 if (inst->src[i].file != UNIFORM)
1706 continue;
1707
1708 int pull_index = pull_constant_loc[inst->src[i].reg +
1709 inst->src[i].reg_offset];
1710 if (pull_index == -1)
1711 continue;
1712
1713 assert(!inst->src[i].reladdr);
1714
1715 fs_reg dst = fs_reg(this, glsl_type::float_type);
1716 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1717 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1718 fs_inst *pull =
1719 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1720 dst, index, offset);
1721 pull->ir = inst->ir;
1722 pull->annotation = inst->annotation;
1723
1724 inst->insert_before(pull);
1725
1726 inst->src[i].file = GRF;
1727 inst->src[i].reg = dst.reg;
1728 inst->src[i].reg_offset = 0;
1729 inst->src[i].smear = pull_index & 3;
1730 }
1731 }
1732 }
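/* Addressing sketch (illustrative): pull_index 13 yields byte offset
 * (13 * 4) & ~15 == 48, the fourth 16-byte-aligned vec4 in the constant
 * buffer, and smear == 13 & 3 == 1 then picks the second component out
 * of the loaded vec4.
 */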
1733
1734 bool
1735 fs_visitor::opt_algebraic()
1736 {
1737 bool progress = false;
1738
1739 foreach_list(node, &this->instructions) {
1740 fs_inst *inst = (fs_inst *)node;
1741
1742 switch (inst->opcode) {
1743 case BRW_OPCODE_MUL:
1744 if (inst->src[1].file != IMM)
1745 continue;
1746
1747 /* a * 1.0 = a */
1748 if (inst->src[1].is_one()) {
1749 inst->opcode = BRW_OPCODE_MOV;
1750 inst->src[1] = reg_undef;
1751 progress = true;
1752 break;
1753 }
1754
1755 /* a * 0.0 = 0.0 */
1756 if (inst->src[1].is_zero()) {
1757 inst->opcode = BRW_OPCODE_MOV;
1758 inst->src[0] = inst->src[1];
1759 inst->src[1] = reg_undef;
1760 progress = true;
1761 break;
1762 }
1763
1764 break;
1765 case BRW_OPCODE_ADD:
1766 if (inst->src[1].file != IMM)
1767 continue;
1768
1769 /* a + 0.0 = a */
1770 if (inst->src[1].is_zero()) {
1771 inst->opcode = BRW_OPCODE_MOV;
1772 inst->src[1] = reg_undef;
1773 progress = true;
1774 break;
1775 }
1776 break;
1777 default:
1778 break;
1779 }
1780 }
1781
1782 return progress;
1783 }
1784
1785 /**
1786 * Removes any instructions writing a VGRF where that VGRF is not used by any
1787 * later instruction.
1788 */
1789 bool
1790 fs_visitor::dead_code_eliminate()
1791 {
1792 bool progress = false;
1793 int pc = 0;
1794
1795 calculate_live_intervals();
1796
1797 foreach_list_safe(node, &this->instructions) {
1798 fs_inst *inst = (fs_inst *)node;
1799
1800 if (inst->dst.file == GRF) {
1801 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1802 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1803 inst->remove();
1804 progress = true;
1805 }
1806 }
1807
1808 pc++;
1809 }
1810
1811 if (progress)
1812 live_intervals_valid = false;
1813
1814 return progress;
1815 }
1816
1817 struct dead_code_hash_key
1818 {
1819 int vgrf;
1820 int reg_offset;
1821 };
1822
1823 static bool
1824 dead_code_hash_compare(const void *a, const void *b)
1825 {
1826 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1827 }
1828
1829 static void
1830 clear_dead_code_hash(struct hash_table *ht)
1831 {
1832 struct hash_entry *entry;
1833
1834 hash_table_foreach(ht, entry) {
1835 _mesa_hash_table_remove(ht, entry);
1836 }
1837 }
1838
1839 static void
1840 insert_dead_code_hash(struct hash_table *ht,
1841 int vgrf, int reg_offset, fs_inst *inst)
1842 {
1843 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1844 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1845
1846 key->vgrf = vgrf;
1847 key->reg_offset = reg_offset;
1848
1849 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1850 }
1851
1852 static struct hash_entry *
1853 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1854 {
1855 struct dead_code_hash_key key;
1856
1857 key.vgrf = vgrf;
1858 key.reg_offset = reg_offset;
1859
1860 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1861 }
1862
1863 static void
1864 remove_dead_code_hash(struct hash_table *ht,
1865 int vgrf, int reg_offset)
1866 {
1867 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1868 if (!entry)
1869 return;
1870
1871 _mesa_hash_table_remove(ht, entry);
1872 }
1873
1874 /**
1875 * Walks basic blocks, removing any regs that are written but not read before
1876 * being redefined.
1877 *
1878 * The dead_code_eliminate() function implements a global dead code
1879 * elimination, but it only handles the removing the last write to a register
1880 * if it's never read. This one can handle intermediate writes, but only
1881 * within a basic block.
1882 */
1883 bool
1884 fs_visitor::dead_code_eliminate_local()
1885 {
1886 struct hash_table *ht;
1887 bool progress = false;
1888
1889 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1890
1891 foreach_list_safe(node, &this->instructions) {
1892 fs_inst *inst = (fs_inst *)node;
1893
1894 /* At a basic block, empty the HT since we don't understand dataflow
1895 * here.
1896 */
1897 if (inst->is_control_flow()) {
1898 clear_dead_code_hash(ht);
1899 continue;
1900 }
1901
1902 /* Clear the HT of any instructions that got read. */
1903 for (int i = 0; i < 3; i++) {
1904 fs_reg src = inst->src[i];
1905 if (src.file != GRF)
1906 continue;
1907
1908 int read = 1;
1909 if (inst->is_send_from_grf())
1910 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1911
1912 for (int reg_offset = src.reg_offset;
1913 reg_offset < src.reg_offset + read;
1914 reg_offset++) {
1915 remove_dead_code_hash(ht, src.reg, reg_offset);
1916 }
1917 }
1918
1919 /* Add any update of a GRF to the HT, removing a previous write if it
1920 * wasn't read.
1921 */
1922 if (inst->dst.file == GRF) {
1923 if (inst->regs_written > 1) {
1924 /* We don't know how to trim channels from an instruction's
1925 * writes, so we can't incrementally remove unread channels from
1926 * it. Just remove whatever it overwrites from the table
1927 */
1928 for (int i = 0; i < inst->regs_written; i++) {
1929 remove_dead_code_hash(ht,
1930 inst->dst.reg,
1931 inst->dst.reg_offset + i);
1932 }
1933 } else {
1934 struct hash_entry *entry =
1935 get_dead_code_hash_entry(ht, inst->dst.reg,
1936 inst->dst.reg_offset);
1937
1938 if (inst->is_partial_write()) {
1939 /* For a partial write, we can't remove any previous dead code
1940 * candidate, since we're just modifying its result, but we can
1941 * be dead code eliminated ourselves.
1942 */
1943 if (entry) {
1944 entry->data = inst;
1945 } else {
1946 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1947 inst);
1948 }
1949 } else {
1950 if (entry) {
1951 /* We're completely updating a channel, and there was a
1952 * previous write to the channel that wasn't read. Kill it!
1953 */
1954 fs_inst *inst = (fs_inst *)entry->data;
1955 inst->remove();
1956 progress = true;
1957 _mesa_hash_table_remove(ht, entry);
1958 }
1959
1960 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1961 inst);
1962 }
1963 }
1964 }
1965 }
1966
1967 _mesa_hash_table_destroy(ht, NULL);
1968
1969 if (progress)
1970 live_intervals_valid = false;
1971
1972 return progress;
1973 }
1974
1975 /**
1976 * Implements a second type of register coalescing: This one checks if
1977 * the two regs involved in a raw move don't interfere, in which case
1978 * they can both be stored in the same place and the MOV removed.
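*
* A sketch (hypothetical vgrf numbers), assuming vgrf1's live range ends
* at the MOV:
*
*    add vgrf1, vgrf3, vgrf4        add vgrf2, vgrf3, vgrf4
*    mov vgrf2, vgrf1          ->   (MOV removed)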
1979 */
1980 bool
1981 fs_visitor::register_coalesce_2()
1982 {
1983 bool progress = false;
1984
1985 calculate_live_intervals();
1986
1987 foreach_list_safe(node, &this->instructions) {
1988 fs_inst *inst = (fs_inst *)node;
1989
1990 if (inst->opcode != BRW_OPCODE_MOV ||
1991 inst->is_partial_write() ||
1992 inst->saturate ||
1993 inst->src[0].file != GRF ||
1994 inst->src[0].negate ||
1995 inst->src[0].abs ||
1996 inst->src[0].smear != -1 ||
1997 inst->dst.file != GRF ||
1998 inst->dst.type != inst->src[0].type ||
1999 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2000 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2001 continue;
2002 }
2003
2004 int reg_from = inst->src[0].reg;
2005 assert(inst->src[0].reg_offset == 0);
2006 int reg_to = inst->dst.reg;
2007 int reg_to_offset = inst->dst.reg_offset;
2008
2009 foreach_list(node, &this->instructions) {
2010 fs_inst *scan_inst = (fs_inst *)node;
2011
2012 if (scan_inst->dst.file == GRF &&
2013 scan_inst->dst.reg == reg_from) {
2014 scan_inst->dst.reg = reg_to;
2015 scan_inst->dst.reg_offset = reg_to_offset;
2016 }
2017 for (int i = 0; i < 3; i++) {
2018 if (scan_inst->src[i].file == GRF &&
2019 scan_inst->src[i].reg == reg_from) {
2020 scan_inst->src[i].reg = reg_to;
2021 scan_inst->src[i].reg_offset = reg_to_offset;
2022 }
2023 }
2024 }
2025
2026 inst->remove();
2027
2028 /* We don't need to recalculate live intervals inside the loop despite
2029 * flagging live_intervals_valid because we only use live intervals for
2030 * the interferes test, and we must have had a situation where the
2031 * intervals were:
2032 *
2033 * from      to
2034 *  ^
2035 *  |
2036 *  v
2037 *            ^
2038 *            |
2039 *            v
2040 *
2041 * Some register R that might get coalesced with one of these two could
2042 * only be referencing "to", otherwise "from"'s range would have been
2043 * longer. R's range could also only start at the end of "to" or later,
2044 * otherwise it would conflict with "to" when we try to coalesce "to"
2045 * into R anyway.
2046 */
2047 live_intervals_valid = false;
2048
2049 progress = true;
2050 continue;
2051 }
2052
2053 return progress;
2054 }
2055
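/**
 * The original register coalescing pass: for a raw MOV out of a GRF or
 * uniform where neither register is overwritten again before the end of
 * the program, rewrites later reads of the destination to read the source
 * directly and removes the MOV. A sketch (hypothetical vgrf numbers):
 *
 *    mov vgrf2, vgrf1
 *    mul vgrf3, vgrf2, vgrf4   ->   mul vgrf3, vgrf1, vgrf4
 */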
2056 bool
2057 fs_visitor::register_coalesce()
2058 {
2059 bool progress = false;
2060 int if_depth = 0;
2061 int loop_depth = 0;
2062
2063 foreach_list_safe(node, &this->instructions) {
2064 fs_inst *inst = (fs_inst *)node;
2065
2066 /* Make sure that we dominate the instructions we're going to
2067 * scan for interfering with our coalescing, or we won't have
2068 * scanned enough to see whether anything interferes. We don't
2069 * dominate the following instructions if
2070 * we're in a loop or an if block.
2071 */
2072 switch (inst->opcode) {
2073 case BRW_OPCODE_DO:
2074 loop_depth++;
2075 break;
2076 case BRW_OPCODE_WHILE:
2077 loop_depth--;
2078 break;
2079 case BRW_OPCODE_IF:
2080 if_depth++;
2081 break;
2082 case BRW_OPCODE_ENDIF:
2083 if_depth--;
2084 break;
2085 default:
2086 break;
2087 }
2088 if (loop_depth || if_depth)
2089 continue;
2090
2091 if (inst->opcode != BRW_OPCODE_MOV ||
2092 inst->is_partial_write() ||
2093 inst->saturate ||
2094 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2095 inst->src[0].file != UNIFORM) ||
2096 inst->dst.type != inst->src[0].type)
2097 continue;
2098
2099 bool has_source_modifiers = (inst->src[0].abs ||
2100 inst->src[0].negate ||
2101 inst->src[0].smear != -1 ||
2102 inst->src[0].file == UNIFORM);
2103
2104 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2105 * them: check for no writes to either one until the exit of the
2106 * program.
2107 */
2108 bool interfered = false;
2109
2110 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2111 !scan_inst->is_tail_sentinel();
2112 scan_inst = (fs_inst *)scan_inst->next) {
2113 if (scan_inst->dst.file == GRF) {
2114 if (scan_inst->overwrites_reg(inst->dst) ||
2115 scan_inst->overwrites_reg(inst->src[0])) {
2116 interfered = true;
2117 break;
2118 }
2119 }
2120
2121 if (has_source_modifiers) {
2122 for (int i = 0; i < 3; i++) {
2123 if (scan_inst->src[i].file == GRF &&
2124 scan_inst->src[i].reg == inst->dst.reg &&
2125 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2126 inst->dst.type != scan_inst->src[i].type)
2127 {
2128 interfered = true;
2129 break;
2130 }
2131 }
2132 }
2133
2134
2135 /* The gen6 MATH instruction can't handle source modifiers or
2136 * unusual register regions, so avoid coalescing those for
2137 * now. We should do something more specific.
2138 */
2139 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2140 interfered = true;
2141 break;
2142 }
2143
2144 /* The accumulator result appears to get used for the
2145 * conditional modifier generation. When negating a UD
2146 * value, there is a 33rd bit generated for the sign in the
2147 * accumulator value, so now you can't check, for example,
2148 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2149 */
2150 if (scan_inst->conditional_mod &&
2151 inst->src[0].negate &&
2152 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2153 interfered = true;
2154 break;
2155 }
2156 }
2157 if (interfered) {
2158 continue;
2159 }
2160
2161 /* Rewrite the later usage to point at the source of the move to
2162 * be removed.
2163 */
2164 for (fs_inst *scan_inst = inst;
2165 !scan_inst->is_tail_sentinel();
2166 scan_inst = (fs_inst *)scan_inst->next) {
2167 for (int i = 0; i < 3; i++) {
2168 if (scan_inst->src[i].file == GRF &&
2169 scan_inst->src[i].reg == inst->dst.reg &&
2170 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2171 fs_reg new_src = inst->src[0];
2172 if (scan_inst->src[i].abs) {
2173 new_src.negate = 0;
2174 new_src.abs = 1;
2175 }
2176 new_src.negate ^= scan_inst->src[i].negate;
2177 scan_inst->src[i] = new_src;
2178 }
2179 }
2180 }
2181
2182 inst->remove();
2183 progress = true;
2184 }
2185
2186 if (progress)
2187 live_intervals_valid = false;
2188
2189 return progress;
2190 }
2191
2192
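/**
 * Tries to rewrite the computation generating a GRF that is only MOVed
 * into an MRF so that it writes the MRF directly, eliminating the MOV.
 * A sketch (hypothetical register numbers):
 *
 *    add vgrf1, vgrf2, vgrf3        add m4, vgrf2, vgrf3
 *    mov m4, vgrf1             ->   (MOV removed)
 */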
2193 bool
2194 fs_visitor::compute_to_mrf()
2195 {
2196 bool progress = false;
2197 int next_ip = 0;
2198
2199 calculate_live_intervals();
2200
2201 foreach_list_safe(node, &this->instructions) {
2202 fs_inst *inst = (fs_inst *)node;
2203
2204 int ip = next_ip;
2205 next_ip++;
2206
2207 if (inst->opcode != BRW_OPCODE_MOV ||
2208 inst->is_partial_write() ||
2209 inst->dst.file != MRF || inst->src[0].file != GRF ||
2210 inst->dst.type != inst->src[0].type ||
2211 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2212 continue;
2213
2214 /* Work out which hardware MRF registers are written by this
2215 * instruction.
2216 */
2217 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2218 int mrf_high;
2219 if (inst->dst.reg & BRW_MRF_COMPR4) {
2220 mrf_high = mrf_low + 4;
2221 } else if (dispatch_width == 16 &&
2222 (!inst->force_uncompressed && !inst->force_sechalf)) {
2223 mrf_high = mrf_low + 1;
2224 } else {
2225 mrf_high = mrf_low;
2226 }
2227
2228 /* Can't compute-to-MRF this GRF if someone else was going to
2229 * read it later.
2230 */
2231 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2232 continue;
2233
2234 /* Found a move of a GRF to a MRF. Let's see if we can go
2235 * rewrite the thing that made this GRF to write into the MRF.
2236 */
2237 fs_inst *scan_inst;
2238 for (scan_inst = (fs_inst *)inst->prev;
2239 scan_inst->prev != NULL;
2240 scan_inst = (fs_inst *)scan_inst->prev) {
2241 if (scan_inst->dst.file == GRF &&
2242 scan_inst->dst.reg == inst->src[0].reg) {
2243 /* Found the last thing to write our reg we want to turn
2244 * into a compute-to-MRF.
2245 */
2246
2247 /* If this one instruction didn't populate all the
2248 * channels, bail. We might be able to rewrite everything
2249 * that writes that reg, but it would require smarter
2250 * tracking to delay the rewriting until complete success.
2251 */
2252 if (scan_inst->is_partial_write())
2253 break;
2254
2255 /* Things returning more than one register would need us to
2256 * understand coalescing out more than one MOV at a time.
2257 */
2258 if (scan_inst->regs_written > 1)
2259 break;
2260
2261 /* SEND instructions can't have MRF as a destination. */
2262 if (scan_inst->mlen)
2263 break;
2264
2265 if (brw->gen == 6) {
2266 /* gen6 math instructions must have the destination be
2267 * GRF, so no compute-to-MRF for them.
2268 */
2269 if (scan_inst->is_math()) {
2270 break;
2271 }
2272 }
2273
2274 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2275 /* Found the creator of our MRF's source value. */
2276 scan_inst->dst.file = MRF;
2277 scan_inst->dst.reg = inst->dst.reg;
2278 scan_inst->saturate |= inst->saturate;
2279 inst->remove();
2280 progress = true;
2281 }
2282 break;
2283 }
2284
2285 /* We don't handle control flow here. Most computation of
2286 * values that end up in MRFs happens shortly before the MRF
2287 * write anyway.
2288 */
2289 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2290 break;
2291
2292 /* You can't read from an MRF, so if someone else reads our
2293 * MRF's source GRF that we wanted to rewrite, that stops us.
2294 */
2295 bool interfered = false;
2296 for (int i = 0; i < 3; i++) {
2297 if (scan_inst->src[i].file == GRF &&
2298 scan_inst->src[i].reg == inst->src[0].reg &&
2299 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2300 interfered = true;
2301 }
2302 }
2303 if (interfered)
2304 break;
2305
2306 if (scan_inst->dst.file == MRF) {
2307 /* If somebody else writes our MRF here, we can't
2308 * compute-to-MRF before that.
2309 */
2310 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2311 int scan_mrf_high;
2312
2313 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2314 scan_mrf_high = scan_mrf_low + 4;
2315 } else if (dispatch_width == 16 &&
2316 (!scan_inst->force_uncompressed &&
2317 !scan_inst->force_sechalf)) {
2318 scan_mrf_high = scan_mrf_low + 1;
2319 } else {
2320 scan_mrf_high = scan_mrf_low;
2321 }
2322
2323 if (mrf_low == scan_mrf_low ||
2324 mrf_low == scan_mrf_high ||
2325 mrf_high == scan_mrf_low ||
2326 mrf_high == scan_mrf_high) {
2327 break;
2328 }
2329 }
2330
2331 if (scan_inst->mlen > 0) {
2332 /* Found a SEND instruction, which means that there are
2333 * live values in MRFs from base_mrf to base_mrf +
2334 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2335 * above it.
2336 */
2337 if (mrf_low >= scan_inst->base_mrf &&
2338 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2339 break;
2340 }
2341 if (mrf_high >= scan_inst->base_mrf &&
2342 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2343 break;
2344 }
2345 }
2346 }
2347 }
2348
2349 if (progress)
2350 live_intervals_valid = false;
2351
2352 return progress;
2353 }
2354
2355 /**
2356 * Walks through basic blocks, looking for repeated MRF writes and
2357 * removing the later ones.
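*
* For example (hypothetical registers), when nothing between the two MOVs
* touches m3 or vgrf5:
*
*    mov m3, vgrf5
*    ...
*    mov m3, vgrf5   <- removed as redundant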
2358 */
2359 bool
2360 fs_visitor::remove_duplicate_mrf_writes()
2361 {
2362 fs_inst *last_mrf_move[16];
2363 bool progress = false;
2364
2365 /* Need to update the MRF tracking for compressed instructions. */
2366 if (dispatch_width == 16)
2367 return false;
2368
2369 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2370
2371 foreach_list_safe(node, &this->instructions) {
2372 fs_inst *inst = (fs_inst *)node;
2373
2374 if (inst->is_control_flow()) {
2375 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2376 }
2377
2378 if (inst->opcode == BRW_OPCODE_MOV &&
2379 inst->dst.file == MRF) {
2380 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2381 if (prev_inst && inst->equals(prev_inst)) {
2382 inst->remove();
2383 progress = true;
2384 continue;
2385 }
2386 }
2387
2388 /* Clear out the last-write records for MRFs that were overwritten. */
2389 if (inst->dst.file == MRF) {
2390 last_mrf_move[inst->dst.reg] = NULL;
2391 }
2392
2393 if (inst->mlen > 0) {
2394 /* Found a SEND instruction, which will include two or fewer
2395 * implied MRF writes. We could do better here.
2396 */
2397 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2398 last_mrf_move[inst->base_mrf + i] = NULL;
2399 }
2400 }
2401
2402 /* Clear out any MRF move records whose sources got overwritten. */
2403 if (inst->dst.file == GRF) {
2404 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2405 if (last_mrf_move[i] &&
2406 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2407 last_mrf_move[i] = NULL;
2408 }
2409 }
2410 }
2411
2412 if (inst->opcode == BRW_OPCODE_MOV &&
2413 inst->dst.file == MRF &&
2414 inst->src[0].file == GRF &&
2415 !inst->is_partial_write()) {
2416 last_mrf_move[inst->dst.reg] = inst;
2417 }
2418 }
2419
2420 if (progress)
2421 live_intervals_valid = false;
2422
2423 return progress;
2424 }
2425
2426 static void
2427 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2428 int first_grf, int grf_len)
2429 {
2430 bool inst_16wide = (dispatch_width > 8 &&
2431 !inst->force_uncompressed &&
2432 !inst->force_sechalf);
2433
2434 /* Clear the flag for registers that actually got read (as expected). */
2435 for (int i = 0; i < 3; i++) {
2436 int grf;
2437 if (inst->src[i].file == GRF) {
2438 grf = inst->src[i].reg;
2439 } else if (inst->src[i].file == HW_REG &&
2440 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2441 grf = inst->src[i].fixed_hw_reg.nr;
2442 } else {
2443 continue;
2444 }
2445
2446 if (grf >= first_grf &&
2447 grf < first_grf + grf_len) {
2448 deps[grf - first_grf] = false;
2449 if (inst_16wide)
2450 deps[grf - first_grf + 1] = false;
2451 }
2452 }
2453 }
2454
2455 /**
2456 * Implements this workaround for the original 965:
2457 *
2458 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2459 * check for post destination dependencies on this instruction, software
2460 * must ensure that there is no destination hazard for the case of ‘write
2461 * followed by a posted write’ shown in the following example.
2462 *
2463 * 1. mov r3 0
2464 * 2. send r3.xy <rest of send instruction>
2465 * 3. mov r2 r3
2466 *
2467 * Due to no post-destination dependency check on the ‘send’, the above
2468 * code sequence could have two instructions (1 and 2) in flight at the
2469 * same time that both consider ‘r3’ as the target of their final writes."
2470 */
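/* The backwards walk below resolves the hazard by inserting a
 * dependency-resolving MOV (DEP_RESOLVE_MOV) touching each still-hazardous
 * GRF right before our instruction, so the earlier posted write must
 * retire first -- in the example above, a read of r3 would land between
 * instructions 1 and 2.
 */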
2471 void
2472 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2473 {
2474 int reg_size = dispatch_width / 8;
2475 int write_len = inst->regs_written * reg_size;
2476 int first_write_grf = inst->dst.reg;
2477 bool needs_dep[BRW_MAX_MRF];
2478 assert(write_len < (int)sizeof(needs_dep) - 1);
2479
2480 memset(needs_dep, false, sizeof(needs_dep));
2481 memset(needs_dep, true, write_len);
2482
2483 clear_deps_for_inst_src(inst, dispatch_width,
2484 needs_dep, first_write_grf, write_len);
2485
2486 /* Walk backwards looking for writes to registers we're writing which
2487 * aren't read since being written. If we hit the start of the program,
2488 * we assume that there are no outstanding dependencies on entry to the
2489 * program.
2490 */
2491 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2492 scan_inst != NULL;
2493 scan_inst = (fs_inst *)scan_inst->prev) {
2494
2495 /* If we hit control flow, assume that there *are* outstanding
2496 * dependencies, and force their cleanup before our instruction.
2497 */
2498 if (scan_inst->is_control_flow()) {
2499 for (int i = 0; i < write_len; i++) {
2500 if (needs_dep[i]) {
2501 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2502 }
2503 }
2504 return;
2505 }
2506
2507 bool scan_inst_16wide = (dispatch_width > 8 &&
2508 !scan_inst->force_uncompressed &&
2509 !scan_inst->force_sechalf);
2510
2511 /* We insert our reads as late as possible, on the assumption that any
2512 * instruction other than a MOV that might have left us an outstanding
2513 * dependency has more latency than a MOV.
2514 */
2515 if (scan_inst->dst.file == GRF) {
2516 for (int i = 0; i < scan_inst->regs_written; i++) {
2517 int reg = scan_inst->dst.reg + i * reg_size;
2518
2519 if (reg >= first_write_grf &&
2520 reg < first_write_grf + write_len &&
2521 needs_dep[reg - first_write_grf]) {
2522 inst->insert_before(DEP_RESOLVE_MOV(reg));
2523 needs_dep[reg - first_write_grf] = false;
2524 if (scan_inst_16wide)
2525 needs_dep[reg - first_write_grf + 1] = false;
2526 }
2527 }
2528 }
2529
2530 /* Clear the flag for registers that actually got read (as expected). */
2531 clear_deps_for_inst_src(scan_inst, dispatch_width,
2532 needs_dep, first_write_grf, write_len);
2533
2534 /* Continue the loop only if we haven't resolved all the dependencies */
2535 int i;
2536 for (i = 0; i < write_len; i++) {
2537 if (needs_dep[i])
2538 break;
2539 }
2540 if (i == write_len)
2541 return;
2542 }
2543 }
2544
2545 /**
2546 * Implements this workaround for the original 965:
2547 *
2548 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2549 * used as a destination register until after it has been sourced by an
2550 * instruction with a different destination register."
2551 */
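/* Here the walk goes forwards from the send: any later instruction that
 * would overwrite part of the send's destination without it having been
 * sourced first gets a resolving read (DEP_RESOLVE_MOV) inserted ahead of
 * it. E.g. (hypothetical register numbers): "send g8 ..." followed by
 * "mov g8 ..." gains a read of g8 in between.
 */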
2552 void
2553 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2554 {
2555 int write_len = inst->regs_written * dispatch_width / 8;
2556 int first_write_grf = inst->dst.reg;
2557 bool needs_dep[BRW_MAX_MRF];
2558 assert(write_len < (int)sizeof(needs_dep) - 1);
2559
2560 memset(needs_dep, false, sizeof(needs_dep));
2561 memset(needs_dep, true, write_len);
2562 /* Walk forwards looking for writes to registers we're writing which aren't
2563 * read before being written.
2564 */
2565 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2566 !scan_inst->is_tail_sentinel();
2567 scan_inst = (fs_inst *)scan_inst->next) {
2568 /* If we hit control flow, force resolve all remaining dependencies. */
2569 if (scan_inst->is_control_flow()) {
2570 for (int i = 0; i < write_len; i++) {
2571 if (needs_dep[i])
2572 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2573 }
2574 return;
2575 }
2576
2577 /* Clear the flag for registers that actually got read (as expected). */
2578 clear_deps_for_inst_src(scan_inst, dispatch_width,
2579 needs_dep, first_write_grf, write_len);
2580
2581 /* We insert our reads as late as possible since they're reading the
2582 * result of a SEND, which has massive latency.
2583 */
2584 if (scan_inst->dst.file == GRF &&
2585 scan_inst->dst.reg >= first_write_grf &&
2586 scan_inst->dst.reg < first_write_grf + write_len &&
2587 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2588 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2589 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2590 }
2591
2592 /* Continue the loop only if we haven't resolved all the dependencies */
2593 int i;
2594 for (i = 0; i < write_len; i++) {
2595 if (needs_dep[i])
2596 break;
2597 }
2598 if (i == write_len)
2599 return;
2600 }
2601
2602 /* If we hit the end of the program, resolve all remaining dependencies out
2603 * of paranoia.
2604 */
2605 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2606 assert(last_inst->eot);
2607 for (int i = 0; i < write_len; i++) {
2608 if (needs_dep[i])
2609 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2610 }
2611 }
2612
2613 void
2614 fs_visitor::insert_gen4_send_dependency_workarounds()
2615 {
2616 if (brw->gen != 4 || brw->is_g4x)
2617 return;
2618
2619 /* Note that we're done with register allocation, so GRF fs_regs always
2620 * have a .reg_offset of 0.
2621 */
2622
2623 foreach_list_safe(node, &this->instructions) {
2624 fs_inst *inst = (fs_inst *)node;
2625
2626 if (inst->mlen != 0 && inst->dst.file == GRF) {
2627 insert_gen4_pre_send_dependency_workarounds(inst);
2628 insert_gen4_post_send_dependency_workarounds(inst);
2629 }
2630 }
2631 }
2632
2633 /**
2634 * Turns the generic expression-style uniform pull constant load instruction
2635 * into a hardware-specific series of instructions for loading a pull
2636 * constant.
2637 *
2638 * The expression style allows the CSE pass before this to optimize out
2639 * repeated loads from the same offset, and gives the pre-register-allocation
2640 * scheduling full flexibility, while the conversion to native instructions
2641 * allows the post-register-allocation scheduler the best information
2642 * possible.
2643 *
2644 * Note that execution masking for setting up pull constant loads is special:
2645 * the channels that need to be written are unrelated to the current execution
2646 * mask, since a later instruction will use one of the result channels as a
2647 * source operand for all 8 or 16 of its channels.
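*
* For example (hypothetical offsets), on gen7 a load at vec4-aligned byte
* offset 32 becomes an FS_OPCODE_SET_SIMD4X2_OFFSET of dword offset 8
* feeding an FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7; on older gens the
* opcode is kept and just assigned an MRF for its message payload.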
2648 */
2649 void
2650 fs_visitor::lower_uniform_pull_constant_loads()
2651 {
2652 foreach_list(node, &this->instructions) {
2653 fs_inst *inst = (fs_inst *)node;
2654
2655 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2656 continue;
2657
2658 if (brw->gen >= 7) {
2659 /* Until now, the offset arg has been a vec4-aligned byte offset. We
2660 * need to turn it into a dword offset.
2661 */
2662 fs_reg const_offset_reg = inst->src[1];
2663 assert(const_offset_reg.file == IMM &&
2664 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2665 const_offset_reg.imm.u /= 4;
2666 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2667
2668 /* This is actually going to be a MOV, but since only the first dword
2669 * is accessed, we have a special opcode to do just that one. Note
2670 * that this needs to be an operation that will be considered a def
2671 * by live variable analysis, or register allocation will explode.
2672 */
2673 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2674 payload, const_offset_reg);
2675 setup->force_writemask_all = true;
2676
2677 setup->ir = inst->ir;
2678 setup->annotation = inst->annotation;
2679 inst->insert_before(setup);
2680
2681 /* Similarly, this will only populate the first 4 channels of the
2682 * result register (since we only use smear values from 0-3), but we
2683 * don't tell the optimizer.
2684 */
2685 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2686 inst->src[1] = payload;
2687
2688 this->live_intervals_valid = false;
2689 } else {
2690 /* Before register allocation, we didn't tell the scheduler about the
2691 * MRF we use. We know it's safe to use this MRF because nothing
2692 * else does except for register spill/unspill, which generates and
2693 * uses its MRF within a single IR instruction.
2694 */
2695 inst->base_mrf = 14;
2696 inst->mlen = 1;
2697 }
2698 }
2699 }
2700
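/* Prints a single IR instruction in a compact one-line form. A
 * representative line (hypothetical values):
 *
 *    (+f0.1) add.sat vgrf3+1, -vgrf1, |vgrf2|, (null)
 *
 * predicate first, then the opcode with .sat/.cmod decorations, then the
 * destination followed by the three sources.
 */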
2701 void
2702 fs_visitor::dump_instruction(backend_instruction *be_inst)
2703 {
2704 fs_inst *inst = (fs_inst *)be_inst;
2705
2706 if (inst->predicate) {
2707 printf("(%cf0.%d) ",
2708 inst->predicate_inverse ? '-' : '+',
2709 inst->flag_subreg);
2710 }
2711
2712 printf("%s", brw_instruction_name(inst->opcode));
2713 if (inst->saturate)
2714 printf(".sat");
2715 if (inst->conditional_mod) {
2716 printf(".cmod");
2717 if (!inst->predicate &&
2718 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2719 inst->opcode != BRW_OPCODE_IF &&
2720 inst->opcode != BRW_OPCODE_WHILE))) {
2721 printf(".f0.%d", inst->flag_subreg);
2722 }
2723 }
2724 printf(" ");
2725
2726
2727 switch (inst->dst.file) {
2728 case GRF:
2729 printf("vgrf%d", inst->dst.reg);
2730 if (inst->dst.reg_offset)
2731 printf("+%d", inst->dst.reg_offset);
2732 break;
2733 case MRF:
2734 printf("m%d", inst->dst.reg);
2735 break;
2736 case BAD_FILE:
2737 printf("(null)");
2738 break;
2739 case UNIFORM:
2740 printf("***u%d***", inst->dst.reg);
2741 break;
2742 case ARF:
2743 if (inst->dst.reg == BRW_ARF_NULL)
2744 printf("(null)");
2745 else
2746 printf("arf%d", inst->dst.reg);
2747 break;
2748 default:
2749 printf("???");
2750 break;
2751 }
2752 printf(", ");
2753
2754 for (int i = 0; i < 3; i++) {
2755 if (inst->src[i].negate)
2756 printf("-");
2757 if (inst->src[i].abs)
2758 printf("|");
2759 switch (inst->src[i].file) {
2760 case GRF:
2761 printf("vgrf%d", inst->src[i].reg);
2762 if (inst->src[i].reg_offset)
2763 printf("+%d", inst->src[i].reg_offset);
2764 break;
2765 case MRF:
2766 printf("***m%d***", inst->src[i].reg);
2767 break;
2768 case UNIFORM:
2769 printf("u%d", inst->src[i].reg);
2770 if (inst->src[i].reg_offset)
2771 printf(".%d", inst->src[i].reg_offset);
2772 break;
2773 case BAD_FILE:
2774 printf("(null)");
2775 break;
2776 case IMM:
2777 switch (inst->src[i].type) {
2778 case BRW_REGISTER_TYPE_F:
2779 printf("%ff", inst->src[i].imm.f);
2780 break;
2781 case BRW_REGISTER_TYPE_D:
2782 printf("%dd", inst->src[i].imm.i);
2783 break;
2784 case BRW_REGISTER_TYPE_UD:
2785 printf("%uu", inst->src[i].imm.u);
2786 break;
2787 default:
2788 printf("???");
2789 break;
2790 }
2791 break;
2792 default:
2793 printf("???");
2794 break;
2795 }
2796 if (inst->src[i].abs)
2797 printf("|");
2798
2799 if (i < 3)
2800 printf(", ");
2801 }
2802
2803 printf(" ");
2804
2805 if (inst->force_uncompressed)
2806 printf("1sthalf ");
2807
2808 if (inst->force_sechalf)
2809 printf("2ndhalf ");
2810
2811 printf("\n");
2812 }
2813
2814 /**
2815 * Possibly returns an instruction that set up @param reg.
2816 *
2817 * Sometimes we want to take the result of some expression/variable
2818 * dereference tree and rewrite the instruction generating the result
2819 * of the tree. When processing the tree, we know that the
2820 * instructions generated are all writing temporaries that are dead
2821 * outside of this tree. So, if we have some instructions that write
2822 * a temporary, we're free to point that temp write somewhere else.
2823 *
2824 * Note that this doesn't guarantee that the returned instruction wrote
2825 * only reg -- it might be the size=4 destination of a texture instruction.
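*
* A typical (hypothetical) use: after emitting "add temp, a, b" for an
* expression tree, passing temp here returns the ADD so that its
* destination can be repointed, saving a MOV.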
2826 */
2827 fs_inst *
2828 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2829 fs_inst *end,
2830 fs_reg reg)
2831 {
2832 if (end == start ||
2833 end->is_partial_write() ||
2834 reg.reladdr ||
2835 !reg.equals(end->dst)) {
2836 return NULL;
2837 } else {
2838 return end;
2839 }
2840 }
2841
2842 void
2843 fs_visitor::setup_payload_gen6()
2844 {
2845 bool uses_depth =
2846 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2847 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2848
2849 assert(brw->gen >= 6);
2850
2851 /* R0-1: masks, pixel X/Y coordinates. */
2852 c->nr_payload_regs = 2;
2853 /* R2: only for 32-pixel dispatch. */
2854
2855 /* R3-26: barycentric interpolation coordinates. These appear in the
2856 * same order that they appear in the brw_wm_barycentric_interp_mode
2857 * enum. Each set of coordinates occupies 2 registers if dispatch width
2858 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2859 * appear if they were enabled using the "Barycentric Interpolation
2860 * Mode" bits in WM_STATE.
2861 */
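/* A worked example (hypothetical): at dispatch_width == 16 with two
 * barycentric modes enabled, the first set occupies payload regs 2-5 and
 * the second regs 6-9, leaving nr_payload_regs == 10 before any source
 * depth/W registers are added.
 */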
2862 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2863 if (barycentric_interp_modes & (1 << i)) {
2864 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2865 c->nr_payload_regs += 2;
2866 if (dispatch_width == 16) {
2867 c->nr_payload_regs += 2;
2868 }
2869 }
2870 }
2871
2872 /* R27: interpolated depth if the shader uses source depth. */
2873 if (uses_depth) {
2874 c->source_depth_reg = c->nr_payload_regs;
2875 c->nr_payload_regs++;
2876 if (dispatch_width == 16) {
2877 /* R28: interpolated depth if not 8-wide. */
2878 c->nr_payload_regs++;
2879 }
2880 }
2881 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2882 if (uses_depth) {
2883 c->source_w_reg = c->nr_payload_regs;
2884 c->nr_payload_regs++;
2885 if (dispatch_width == 16) {
2886 /* R30: interpolated W if not 8-wide. */
2887 c->nr_payload_regs++;
2888 }
2889 }
2890 /* R31: MSAA position offsets. */
2891 /* R32-: bary for 32-pixel. */
2892 /* R58-59: interp W for 32-pixel. */
2893
2894 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2895 c->source_depth_to_render_target = true;
2896 }
2897 }
2898
2899 bool
2900 fs_visitor::run()
2901 {
2902 sanity_param_count = fp->Base.Parameters->NumParameters;
2903 uint32_t orig_nr_params = c->prog_data.nr_params;
2904
2905 if (brw->gen >= 6)
2906 setup_payload_gen6();
2907 else
2908 setup_payload_gen4();
2909
2910 if (0) {
2911 emit_dummy_fs();
2912 } else {
2913 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2914 emit_shader_time_begin();
2915
2916 calculate_urb_setup();
2917 if (brw->gen < 6)
2918 emit_interpolation_setup_gen4();
2919 else
2920 emit_interpolation_setup_gen6();
2921
2922 /* We handle discards by keeping track of the still-live pixels in f0.1.
2923 * Initialize it with the dispatched pixels.
2924 */
2925 if (fp->UsesKill) {
2926 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2927 discard_init->flag_subreg = 1;
2928 }
2929
2930 /* Generate FS IR for main(). (the visitor only descends into
2931 * functions called "main").
2932 */
2933 if (shader) {
2934 foreach_list(node, &*shader->ir) {
2935 ir_instruction *ir = (ir_instruction *)node;
2936 base_ir = ir;
2937 this->result = reg_undef;
2938 ir->accept(this);
2939 }
2940 } else {
2941 emit_fragment_program_code();
2942 }
2943 base_ir = NULL;
2944 if (failed)
2945 return false;
2946
2947 emit(FS_OPCODE_PLACEHOLDER_HALT);
2948
2949 emit_fb_writes();
2950
2951 split_virtual_grfs();
2952
2953 move_uniform_array_access_to_pull_constants();
2954 setup_pull_constants();
2955
2956 bool progress;
2957 do {
2958 progress = false;
2959
2960 compact_virtual_grfs();
2961
2962 progress = remove_duplicate_mrf_writes() || progress;
2963
2964 progress = opt_algebraic() || progress;
2965 progress = opt_cse() || progress;
2966 progress = opt_copy_propagate() || progress;
2967 progress = dead_code_eliminate() || progress;
2968 progress = dead_code_eliminate_local() || progress;
2969 progress = register_coalesce() || progress;
2970 progress = register_coalesce_2() || progress;
2971 progress = compute_to_mrf() || progress;
2972 } while (progress);
2973
2974 remove_dead_constants();
2975
2976 schedule_instructions(false);
2977
2978 lower_uniform_pull_constant_loads();
2979
2980 assign_curb_setup();
2981 assign_urb_setup();
2982
2983 if (0) {
2984 /* Debug of register spilling: Go spill everything. */
2985 for (int i = 0; i < virtual_grf_count; i++) {
2986 spill_reg(i);
2987 }
2988 }
2989
2990 if (0)
2991 assign_regs_trivial();
2992 else {
2993 while (!assign_regs()) {
2994 if (failed)
2995 break;
2996 }
2997 }
2998 }
2999 assert(force_uncompressed_stack == 0);
3000 assert(force_sechalf_stack == 0);
3001
3002 /* This must come after all optimization and register allocation, since
3003 * it inserts dead code that happens to have side effects, and it does
3004 * so based on the actual physical registers in use.
3005 */
3006 insert_gen4_send_dependency_workarounds();
3007
3008 if (failed)
3009 return false;
3010
3011 schedule_instructions(true);
3012
3013 if (dispatch_width == 8) {
3014 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3015 } else {
3016 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3017
3018 /* Make sure we didn't try to sneak in an extra uniform */
3019 assert(orig_nr_params == c->prog_data.nr_params);
3020 (void) orig_nr_params;
3021 }
3022
3023 /* If any state parameters were appended, then ParameterValues could have
3024 * been realloced, in which case the driver uniform storage set up by
3025 * _mesa_associate_uniform_storage() would point to freed memory. Make
3026 * sure that didn't happen.
3027 */
3028 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3029
3030 return !failed;
3031 }
3032
3033 const unsigned *
3034 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3035 struct gl_fragment_program *fp,
3036 struct gl_shader_program *prog,
3037 unsigned *final_assembly_size)
3038 {
3039 bool start_busy = false;
3040 float start_time = 0;
3041
3042 if (unlikely(brw->perf_debug)) {
3043 start_busy = (brw->batch.last_bo &&
3044 drm_intel_bo_busy(brw->batch.last_bo));
3045 start_time = get_time();
3046 }
3047
3048 struct brw_shader *shader = NULL;
3049 if (prog)
3050 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3051
3052 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3053 if (prog) {
3054 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3055 _mesa_print_ir(shader->ir, NULL);
3056 printf("\n\n");
3057 } else {
3058 printf("ARB_fragment_program %d ir for native fragment shader\n",
3059 fp->Base.Id);
3060 _mesa_print_program(&fp->Base);
3061 }
3062 }
3063
3064 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3065 */
3066 fs_visitor v(brw, c, prog, fp, 8);
3067 if (!v.run()) {
3068 if (prog) {
3069 prog->LinkStatus = false;
3070 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3071 }
3072
3073 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3074 v.fail_msg);
3075
3076 return NULL;
3077 }
3078
3079 exec_list *simd16_instructions = NULL;
3080 fs_visitor v2(brw, c, prog, fp, 16);
3081 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3082 if (c->prog_data.nr_pull_params == 0) {
3083 /* Try a 16-wide compile */
3084 v2.import_uniforms(&v);
3085 if (!v2.run()) {
3086 perf_debug("16-wide shader failed to compile, falling back to "
3087 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3088 } else {
3089 simd16_instructions = &v2.instructions;
3090 }
3091 } else {
3092 perf_debug("Skipping 16-wide due to pull parameters.\n");
3093 }
3094 }
3095
3096 c->prog_data.dispatch_width = 8;
3097
3098 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3099 const unsigned *generated = g.generate_assembly(&v.instructions,
3100 simd16_instructions,
3101 final_assembly_size);
3102
3103 if (unlikely(brw->perf_debug) && shader) {
3104 if (shader->compiled_once)
3105 brw_wm_debug_recompile(brw, prog, &c->key);
3106 shader->compiled_once = true;
3107
3108 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3109 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3110 (get_time() - start_time) * 1000);
3111 }
3112 }
3113
3114 return generated;
3115 }
3116
3117 bool
3118 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3119 {
3120 struct brw_context *brw = brw_context(ctx);
3121 struct brw_wm_prog_key key;
3122
3123 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3124 return true;
3125
3126 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3127 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3128 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3129 bool program_uses_dfdy = fp->UsesDFdy;
3130
3131 memset(&key, 0, sizeof(key));
3132
3133 if (brw->gen < 6) {
3134 if (fp->UsesKill)
3135 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3136
3137 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3138 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3139
3140 /* Just assume depth testing. */
3141 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3142 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3143 }
3144
3145 if (brw->gen < 6)
3146 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3147
3148 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3149 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3150 continue;
3151
3152 if (brw->gen < 6) {
3153 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3154 key.input_slots_valid |= BITFIELD64_BIT(i);
3155 }
3156 }
3157
3158 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3159
3160 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3161 for (unsigned i = 0; i < sampler_count; i++) {
3162 if (fp->Base.ShadowSamplers & (1 << i)) {
3163 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3164 key.tex.swizzles[i] =
3165 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3166 } else {
3167 /* Color sampler: assume no swizzling. */
3168 key.tex.swizzles[i] = SWIZZLE_XYZW;
3169 }
3170 }
3171
3172 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3173 key.drawable_height = ctx->DrawBuffer->Height;
3174 }
3175
3176 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3177 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3178 }
3179
3180 key.nr_color_regions = 1;
3181
3182 key.program_string_id = bfp->id;
3183
3184 uint32_t old_prog_offset = brw->wm.prog_offset;
3185 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3186
3187 bool success = do_wm_prog(brw, prog, bfp, &key);
3188
3189 brw->wm.prog_offset = old_prog_offset;
3190 brw->wm.prog_data = old_prog_data;
3191
3192 return success;
3193 }