i965/fs: Add dump_instruction() support for ARF destinations.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
66 }
67
68 fs_inst::fs_inst()
69 {
70 init();
71 }
72
73 fs_inst::fs_inst(enum opcode opcode)
74 {
75 init();
76 this->opcode = opcode;
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
80 {
81 init();
82 this->opcode = opcode;
83 this->dst = dst;
84
85 if (dst.file == GRF)
86 assert(dst.reg_offset >= 0);
87 }
88
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
90 {
91 init();
92 this->opcode = opcode;
93 this->dst = dst;
94 this->src[0] = src0;
95
96 if (dst.file == GRF)
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
103 {
104 init();
105 this->opcode = opcode;
106 this->dst = dst;
107 this->src[0] = src0;
108 this->src[1] = src1;
109
110 if (dst.file == GRF)
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
116 }
117
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
120 {
121 init();
122 this->opcode = opcode;
123 this->dst = dst;
124 this->src[0] = src0;
125 this->src[1] = src1;
126 this->src[2] = src2;
127
128 if (dst.file == GRF)
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
136 }
137
138 #define ALU1(op) \
139 fs_inst * \
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
141 { \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
143 }
144
145 #define ALU2(op) \
146 fs_inst * \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
148 { \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
150 }
151
152 #define ALU3(op) \
153 fs_inst * \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
155 { \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
157 }
158
159 ALU1(NOT)
160 ALU1(MOV)
161 ALU1(FRC)
162 ALU1(RNDD)
163 ALU1(RNDE)
164 ALU1(RNDZ)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
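/* For reference, each ALUn() invocation above expands to a trivial factory
 * method; a sketch of what ALU2(ADD) produces:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * so emit(ADD(dst, a, b)) allocates the instruction out of mem_ctx and
 * appends it to the instruction stream.
 */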
182
183 /** Gen4 predicated IF. */
184 fs_inst *
185 fs_visitor::IF(uint32_t predicate)
186 {
187 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
188 inst->predicate = predicate;
189 return inst;
190 }
191
192 /** Gen6+ IF with embedded comparison. */
193 fs_inst *
194 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
195 {
196 assert(brw->gen >= 6);
197 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
198 reg_null_d, src0, src1);
199 inst->conditional_mod = condition;
200 return inst;
201 }
202
203 /**
 204  * CMP: Sets the low bit of each destination channel to the result
 205  * of the comparison, leaves the upper bits undefined, and updates
206 * the flag register with the packed 16 bits of the result.
207 */
208 fs_inst *
209 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
210 {
211 fs_inst *inst;
212
213 /* Take the instruction:
214 *
215 * CMP null<d> src0<f> src1<f>
216 *
217 * Original gen4 does type conversion to the destination type before
218 * comparison, producing garbage results for floating point comparisons.
219 * gen5 does the comparison on the execution type (resolved source types),
220 * so dst type doesn't matter. gen6 does comparison and then uses the
221 * result as if it was the dst type with no conversion, which happens to
222 * mostly work out for float-interpreted-as-int since our comparisons are
223 * for >0, =0, <0.
224 */
225 if (brw->gen == 4) {
226 dst.type = src0.type;
227 if (dst.file == HW_REG)
228 dst.fixed_hw_reg.type = dst.type;
229 }
230
231 resolve_ud_negate(&src0);
232 resolve_ud_negate(&src1);
233
234 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
235 inst->conditional_mod = condition;
236
237 return inst;
238 }
239
240 exec_list
241 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
242 fs_reg varying_offset,
243 uint32_t const_offset)
244 {
245 exec_list instructions;
246 fs_inst *inst;
247
248 /* We have our constant surface use a pitch of 4 bytes, so our index can
249 * be any component of a vector, and then we load 4 contiguous
250 * components starting from that.
251 *
252 * We break down the const_offset to a portion added to the variable
253 * offset and a portion done using reg_offset, which means that if you
254 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
255 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
256 * CSE can later notice that those loads are all the same and eliminate
257 * the redundant ones.
258 */
259 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
260 instructions.push_tail(ADD(vec4_offset,
261 varying_offset, const_offset & ~3));
262
263 int scale = 1;
264 if (brw->gen == 4 && dispatch_width == 8) {
265 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
266 * u, v, r) as parameters, or we can just use the SIMD16 message
267 * consisting of (header, u). We choose the second, at the cost of a
268 * longer return length.
269 */
270 scale = 2;
271 }
272
273 enum opcode op;
274 if (brw->gen >= 7)
275 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
276 else
277 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
278 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
279 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
280 inst->regs_written = 4 * scale;
281 instructions.push_tail(inst);
282
283 if (brw->gen < 7) {
284 inst->base_mrf = 13;
285 inst->header_present = true;
286 if (brw->gen == 4)
287 inst->mlen = 3;
288 else
289 inst->mlen = 1 + dispatch_width / 8;
290 }
291
292 vec4_result.reg_offset += (const_offset & 3) * scale;
293 instructions.push_tail(MOV(dst, vec4_result));
294
295 return instructions;
296 }
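/* A sketch (not an actual IR dump) of the list this builds for the "a[i]"
 * example above, with varying_offset proportional to i and const_offset
 * selecting the component being loaded:
 *
 *    ADD   vec4_offset, varying_offset, (const_offset & ~3)
 *    LOAD  vec4_result (4 regs), surf_index, vec4_offset
 *    MOV   dst, vec4_result + (const_offset & 3)
 *
 * The four per-component loads of a[i] share their first two instructions,
 * which CSE can then merge.
 */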
297
298 /**
299 * A helper for MOV generation for fixing up broken hardware SEND dependency
300 * handling.
301 */
302 fs_inst *
303 fs_visitor::DEP_RESOLVE_MOV(int grf)
304 {
305 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
306
307 inst->ir = NULL;
308 inst->annotation = "send dependency resolve";
309
 310    /* The caller always wants this MOV uncompressed, to emit the minimal extra
 311     * dependencies and to avoid having to align its registers to 2.
312 */
313 inst->force_uncompressed = true;
314
315 return inst;
316 }
317
318 bool
319 fs_inst::equals(fs_inst *inst)
320 {
321 return (opcode == inst->opcode &&
322 dst.equals(inst->dst) &&
323 src[0].equals(inst->src[0]) &&
324 src[1].equals(inst->src[1]) &&
325 src[2].equals(inst->src[2]) &&
326 saturate == inst->saturate &&
327 predicate == inst->predicate &&
328 conditional_mod == inst->conditional_mod &&
329 mlen == inst->mlen &&
330 base_mrf == inst->base_mrf &&
331 sampler == inst->sampler &&
332 target == inst->target &&
333 eot == inst->eot &&
334 header_present == inst->header_present &&
335 shadow_compare == inst->shadow_compare &&
336 offset == inst->offset);
337 }
338
339 bool
340 fs_inst::overwrites_reg(const fs_reg &reg)
341 {
342 return (reg.file == dst.file &&
343 reg.reg == dst.reg &&
344 reg.reg_offset >= dst.reg_offset &&
345 reg.reg_offset < dst.reg_offset + regs_written);
346 }
347
348 bool
349 fs_inst::is_send_from_grf()
350 {
351 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
352 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
353 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
354 src[1].file == GRF));
355 }
356
357 bool
358 fs_visitor::can_do_source_mods(fs_inst *inst)
359 {
360 if (brw->gen == 6 && inst->is_math())
361 return false;
362
363 if (inst->is_send_from_grf())
364 return false;
365
366 return true;
367 }
368
369 void
370 fs_reg::init()
371 {
372 memset(this, 0, sizeof(*this));
373 this->smear = -1;
374 }
375
376 /** Generic unset register constructor. */
377 fs_reg::fs_reg()
378 {
379 init();
380 this->file = BAD_FILE;
381 }
382
383 /** Immediate value constructor. */
384 fs_reg::fs_reg(float f)
385 {
386 init();
387 this->file = IMM;
388 this->type = BRW_REGISTER_TYPE_F;
389 this->imm.f = f;
390 }
391
392 /** Immediate value constructor. */
393 fs_reg::fs_reg(int32_t i)
394 {
395 init();
396 this->file = IMM;
397 this->type = BRW_REGISTER_TYPE_D;
398 this->imm.i = i;
399 }
400
401 /** Immediate value constructor. */
402 fs_reg::fs_reg(uint32_t u)
403 {
404 init();
405 this->file = IMM;
406 this->type = BRW_REGISTER_TYPE_UD;
407 this->imm.u = u;
408 }
409
410 /** Fixed brw_reg Immediate value constructor. */
411 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
412 {
413 init();
414 this->file = HW_REG;
415 this->fixed_hw_reg = fixed_hw_reg;
416 this->type = fixed_hw_reg.type;
417 }
418
419 bool
420 fs_reg::equals(const fs_reg &r) const
421 {
422 return (file == r.file &&
423 reg == r.reg &&
424 reg_offset == r.reg_offset &&
425 type == r.type &&
426 negate == r.negate &&
427 abs == r.abs &&
428 !reladdr && !r.reladdr &&
429 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
430 sizeof(fixed_hw_reg)) == 0 &&
431 smear == r.smear &&
432 imm.u == r.imm.u);
433 }
434
435 bool
436 fs_reg::is_zero() const
437 {
438 if (file != IMM)
439 return false;
440
441 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
442 }
443
444 bool
445 fs_reg::is_one() const
446 {
447 if (file != IMM)
448 return false;
449
450 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
451 }
452
453 bool
454 fs_reg::is_valid_3src() const
455 {
456 return file == GRF || file == UNIFORM;
457 }
458
459 int
460 fs_visitor::type_size(const struct glsl_type *type)
461 {
462 unsigned int size, i;
463
464 switch (type->base_type) {
465 case GLSL_TYPE_UINT:
466 case GLSL_TYPE_INT:
467 case GLSL_TYPE_FLOAT:
468 case GLSL_TYPE_BOOL:
469 return type->components();
470 case GLSL_TYPE_ARRAY:
471 return type_size(type->fields.array) * type->length;
472 case GLSL_TYPE_STRUCT:
473 size = 0;
474 for (i = 0; i < type->length; i++) {
475 size += type_size(type->fields.structure[i].type);
476 }
477 return size;
478 case GLSL_TYPE_SAMPLER:
479 /* Samplers take up no register space, since they're baked in at
480 * link time.
481 */
482 return 0;
483 case GLSL_TYPE_VOID:
484 case GLSL_TYPE_ERROR:
485 case GLSL_TYPE_INTERFACE:
486 assert(!"not reached");
487 break;
488 }
489
490 return 0;
491 }
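/* Some example sizes, in float-sized components: float -> 1, vec4 -> 4,
 * mat3 -> 9, vec4[20] -> 80, struct { vec3 v; float f; } -> 4, and any
 * sampler -> 0, since samplers consume no register space.
 */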
492
493 fs_reg
494 fs_visitor::get_timestamp()
495 {
496 assert(brw->gen >= 7);
497
498 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
499 BRW_ARF_TIMESTAMP,
500 0),
501 BRW_REGISTER_TYPE_UD));
502
503 fs_reg dst = fs_reg(this, glsl_type::uint_type);
504
505 fs_inst *mov = emit(MOV(dst, ts));
506 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 507     * even if those channels aren't enabled in the dispatch.
508 */
509 mov->force_writemask_all = true;
510 mov->force_uncompressed = true;
511
512 /* The caller wants the low 32 bits of the timestamp. Since it's running
 512     * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
514 * which is plenty of time for our purposes. It is identical across the
515 * EUs, but since it's tracking GPU core speed it will increment at a
516 * varying rate as render P-states change.
517 *
518 * The caller could also check if render P-states have changed (or anything
519 * else that might disrupt timing) by setting smear to 2 and checking if
520 * that field is != 0.
521 */
522 dst.smear = 0;
523
524 return dst;
525 }
526
527 void
528 fs_visitor::emit_shader_time_begin()
529 {
530 current_annotation = "shader time start";
531 shader_start_time = get_timestamp();
532 }
533
534 void
535 fs_visitor::emit_shader_time_end()
536 {
537 current_annotation = "shader time end";
538
539 enum shader_time_shader_type type, written_type, reset_type;
540 if (dispatch_width == 8) {
541 type = ST_FS8;
542 written_type = ST_FS8_WRITTEN;
543 reset_type = ST_FS8_RESET;
544 } else {
545 assert(dispatch_width == 16);
546 type = ST_FS16;
547 written_type = ST_FS16_WRITTEN;
548 reset_type = ST_FS16_RESET;
549 }
550
551 fs_reg shader_end_time = get_timestamp();
552
553 /* Check that there weren't any timestamp reset events (assuming these
554 * were the only two timestamp reads that happened).
555 */
556 fs_reg reset = shader_end_time;
557 reset.smear = 2;
558 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
559 test->conditional_mod = BRW_CONDITIONAL_Z;
560 emit(IF(BRW_PREDICATE_NORMAL));
561
562 push_force_uncompressed();
563 fs_reg start = shader_start_time;
564 start.negate = true;
565 fs_reg diff = fs_reg(this, glsl_type::uint_type);
566 emit(ADD(diff, start, shader_end_time));
567
568 /* If there were no instructions between the two timestamp gets, the diff
569 * is 2 cycles. Remove that overhead, so I can forget about that when
570 * trying to determine the time taken for single instructions.
571 */
572 emit(ADD(diff, diff, fs_reg(-2u)));
573
574 emit_shader_time_write(type, diff);
575 emit_shader_time_write(written_type, fs_reg(1u));
576 emit(BRW_OPCODE_ELSE);
577 emit_shader_time_write(reset_type, fs_reg(1u));
578 emit(BRW_OPCODE_ENDIF);
579
580 pop_force_uncompressed();
581 }
582
583 void
584 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
585 fs_reg value)
586 {
587 int shader_time_index =
588 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
589 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
590
591 fs_reg payload;
592 if (dispatch_width == 8)
593 payload = fs_reg(this, glsl_type::uvec2_type);
594 else
595 payload = fs_reg(this, glsl_type::uint_type);
596
597 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
598 fs_reg(), payload, offset, value));
599 }
600
601 void
602 fs_visitor::fail(const char *format, ...)
603 {
604 va_list va;
605 char *msg;
606
607 if (failed)
608 return;
609
610 failed = true;
611
612 va_start(va, format);
613 msg = ralloc_vasprintf(mem_ctx, format, va);
614 va_end(va);
615 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
616
617 this->fail_msg = msg;
618
619 if (INTEL_DEBUG & DEBUG_WM) {
620 fprintf(stderr, "%s", msg);
621 }
622 }
623
624 fs_inst *
625 fs_visitor::emit(enum opcode opcode)
626 {
627 return emit(fs_inst(opcode));
628 }
629
630 fs_inst *
631 fs_visitor::emit(enum opcode opcode, fs_reg dst)
632 {
633 return emit(fs_inst(opcode, dst));
634 }
635
636 fs_inst *
637 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
638 {
639 return emit(fs_inst(opcode, dst, src0));
640 }
641
642 fs_inst *
643 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
644 {
645 return emit(fs_inst(opcode, dst, src0, src1));
646 }
647
648 fs_inst *
649 fs_visitor::emit(enum opcode opcode, fs_reg dst,
650 fs_reg src0, fs_reg src1, fs_reg src2)
651 {
652 return emit(fs_inst(opcode, dst, src0, src1, src2));
653 }
654
655 void
656 fs_visitor::push_force_uncompressed()
657 {
658 force_uncompressed_stack++;
659 }
660
661 void
662 fs_visitor::pop_force_uncompressed()
663 {
664 force_uncompressed_stack--;
665 assert(force_uncompressed_stack >= 0);
666 }
667
668 void
669 fs_visitor::push_force_sechalf()
670 {
671 force_sechalf_stack++;
672 }
673
674 void
675 fs_visitor::pop_force_sechalf()
676 {
677 force_sechalf_stack--;
678 assert(force_sechalf_stack >= 0);
679 }
680
681 /**
682 * Returns true if the instruction has a flag that means it won't
683 * update an entire destination register.
684 *
685 * For example, dead code elimination and live variable analysis want to know
686 * when a write to a variable screens off any preceding values that were in
687 * it.
688 */
689 bool
690 fs_inst::is_partial_write()
691 {
692 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
693 this->force_uncompressed ||
694 this->force_sechalf);
695 }
696
697 /**
698 * Returns how many MRFs an FS opcode will write over.
699 *
700 * Note that this is not the 0 or 1 implied writes in an actual gen
701 * instruction -- the FS opcodes often generate MOVs in addition.
702 */
703 int
704 fs_visitor::implied_mrf_writes(fs_inst *inst)
705 {
706 if (inst->mlen == 0)
707 return 0;
708
709 switch (inst->opcode) {
710 case SHADER_OPCODE_RCP:
711 case SHADER_OPCODE_RSQ:
712 case SHADER_OPCODE_SQRT:
713 case SHADER_OPCODE_EXP2:
714 case SHADER_OPCODE_LOG2:
715 case SHADER_OPCODE_SIN:
716 case SHADER_OPCODE_COS:
717 return 1 * dispatch_width / 8;
718 case SHADER_OPCODE_POW:
719 case SHADER_OPCODE_INT_QUOTIENT:
720 case SHADER_OPCODE_INT_REMAINDER:
721 return 2 * dispatch_width / 8;
722 case SHADER_OPCODE_TEX:
723 case FS_OPCODE_TXB:
724 case SHADER_OPCODE_TXD:
725 case SHADER_OPCODE_TXF:
726 case SHADER_OPCODE_TXF_MS:
727 case SHADER_OPCODE_TXL:
728 case SHADER_OPCODE_TXS:
729 case SHADER_OPCODE_LOD:
730 return 1;
731 case FS_OPCODE_FB_WRITE:
732 return 2;
733 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
734 case FS_OPCODE_UNSPILL:
735 return 1;
736 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
737 return inst->mlen;
738 case FS_OPCODE_SPILL:
739 return 2;
740 default:
741 assert(!"not reached");
742 return inst->mlen;
743 }
744 }
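/* For example, on pre-gen6 a SIMD16 POW (which has a nonzero mlen) reports
 * 2 * 16 / 8 = 4 MRFs written, while a sampler message such as
 * SHADER_OPCODE_TXL always reports 1 regardless of dispatch width.
 */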
745
746 int
747 fs_visitor::virtual_grf_alloc(int size)
748 {
749 if (virtual_grf_array_size <= virtual_grf_count) {
750 if (virtual_grf_array_size == 0)
751 virtual_grf_array_size = 16;
752 else
753 virtual_grf_array_size *= 2;
754 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
755 virtual_grf_array_size);
756 }
757 virtual_grf_sizes[virtual_grf_count] = size;
758 return virtual_grf_count++;
759 }
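/* E.g. successive calls virtual_grf_alloc(4), virtual_grf_alloc(1) and
 * virtual_grf_alloc(2) return registers 0, 1 and 2, leaving
 * virtual_grf_sizes[] = { 4, 1, 2 } with the array grown to 16 entries.
 */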
760
761 /** Fixed HW reg constructor. */
762 fs_reg::fs_reg(enum register_file file, int reg)
763 {
764 init();
765 this->file = file;
766 this->reg = reg;
767 this->type = BRW_REGISTER_TYPE_F;
768 }
769
770 /** Fixed HW reg constructor. */
771 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
772 {
773 init();
774 this->file = file;
775 this->reg = reg;
776 this->type = type;
777 }
778
779 /** Automatic reg constructor. */
780 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
781 {
782 init();
783
784 this->file = GRF;
785 this->reg = v->virtual_grf_alloc(v->type_size(type));
786 this->reg_offset = 0;
787 this->type = brw_type_for_base_type(type);
788 }
789
790 fs_reg *
791 fs_visitor::variable_storage(ir_variable *var)
792 {
793 return (fs_reg *)hash_table_find(this->variable_ht, var);
794 }
795
796 void
797 import_uniforms_callback(const void *key,
798 void *data,
799 void *closure)
800 {
801 struct hash_table *dst_ht = (struct hash_table *)closure;
802 const fs_reg *reg = (const fs_reg *)data;
803
804 if (reg->file != UNIFORM)
805 return;
806
807 hash_table_insert(dst_ht, data, key);
808 }
809
 810 /* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
 811  * This brings in those uniform definitions.
812 */
813 void
814 fs_visitor::import_uniforms(fs_visitor *v)
815 {
816 hash_table_call_foreach(v->variable_ht,
817 import_uniforms_callback,
818 variable_ht);
819 this->params_remap = v->params_remap;
820 this->nr_params_remap = v->nr_params_remap;
821 }
822
823 /* Our support for uniforms is piggy-backed on the struct
824 * gl_fragment_program, because that's where the values actually
825 * get stored, rather than in some global gl_shader_program uniform
826 * store.
827 */
828 void
829 fs_visitor::setup_uniform_values(ir_variable *ir)
830 {
831 int namelen = strlen(ir->name);
832
833 /* The data for our (non-builtin) uniforms is stored in a series of
834 * gl_uniform_driver_storage structs for each subcomponent that
835 * glGetUniformLocation() could name. We know it's been set up in the same
836 * order we'd walk the type, so walk the list of storage and find anything
837 * with our name, or the prefix of a component that starts with our name.
838 */
839 unsigned params_before = c->prog_data.nr_params;
840 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
841 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
842
843 if (strncmp(ir->name, storage->name, namelen) != 0 ||
844 (storage->name[namelen] != 0 &&
845 storage->name[namelen] != '.' &&
846 storage->name[namelen] != '[')) {
847 continue;
848 }
849
850 unsigned slots = storage->type->component_slots();
851 if (storage->array_elements)
852 slots *= storage->array_elements;
853
854 for (unsigned i = 0; i < slots; i++) {
855 c->prog_data.param[c->prog_data.nr_params++] =
856 &storage->storage[i].f;
857 }
858 }
859
860 /* Make sure we actually initialized the right amount of stuff here. */
861 assert(params_before + ir->type->component_slots() ==
862 c->prog_data.nr_params);
863 (void)params_before;
864 }
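/* An illustrative example (the uniform name is hypothetical): for
 * "uniform vec3 light[2]", the matching gl_uniform_storage entry has
 * component_slots() == 3 and array_elements == 2, so the loop appends six
 * param pointers, one per float of light[0].xyz and light[1].xyz.
 */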
865
866
867 /* Our support for builtin uniforms is even scarier than non-builtin.
868 * It sits on top of the PROG_STATE_VAR parameters that are
869 * automatically updated from GL context state.
870 */
871 void
872 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
873 {
874 const ir_state_slot *const slots = ir->state_slots;
875 assert(ir->state_slots != NULL);
876
877 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
878 /* This state reference has already been setup by ir_to_mesa, but we'll
879 * get the same index back here.
880 */
881 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
882 (gl_state_index *)slots[i].tokens);
883
884 /* Add each of the unique swizzles of the element as a parameter.
885 * This'll end up matching the expected layout of the
886 * array/matrix/structure we're trying to fill in.
887 */
888 int last_swiz = -1;
889 for (unsigned int j = 0; j < 4; j++) {
890 int swiz = GET_SWZ(slots[i].swizzle, j);
891 if (swiz == last_swiz)
892 break;
893 last_swiz = swiz;
894
895 c->prog_data.param[c->prog_data.nr_params++] =
896 &fp->Base.Parameters->ParameterValues[index][swiz].f;
897 }
898 }
899 }
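/* The last_swiz check above deduplicates repeated components: a state slot
 * referenced as .xyzw contributes four params, while one referenced as
 * .xxxx contributes only one, since GET_SWZ() returns the same component
 * for j = 1 as for j = 0 and the loop breaks immediately.
 */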
900
901 fs_reg *
902 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
903 {
904 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
905 fs_reg wpos = *reg;
906 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
907
908 /* gl_FragCoord.x */
909 if (ir->pixel_center_integer) {
910 emit(MOV(wpos, this->pixel_x));
911 } else {
912 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
913 }
914 wpos.reg_offset++;
915
916 /* gl_FragCoord.y */
917 if (!flip && ir->pixel_center_integer) {
918 emit(MOV(wpos, this->pixel_y));
919 } else {
920 fs_reg pixel_y = this->pixel_y;
921 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
922
923 if (flip) {
924 pixel_y.negate = true;
925 offset += c->key.drawable_height - 1.0;
926 }
927
928 emit(ADD(wpos, pixel_y, fs_reg(offset)));
929 }
930 wpos.reg_offset++;
931
932 /* gl_FragCoord.z */
933 if (brw->gen >= 6) {
934 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
935 } else {
936 emit(FS_OPCODE_LINTERP, wpos,
937 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
938 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
939 interp_reg(VARYING_SLOT_POS, 2));
940 }
941 wpos.reg_offset++;
942
943 /* gl_FragCoord.w: Already set up in emit_interpolation */
944 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
945
946 return reg;
947 }
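/* A sketch of the flipped-y math above: when rendering to the window system
 * (flip == true) with half-integer pixel centers, the emitted ADD computes
 *
 *    gl_FragCoord.y = -pixel_y + (0.5 + drawable_height - 1.0)
 *
 * converting pixel_y, which counts down from the top of the drawable, to
 * the GL convention of y increasing upward from the bottom.
 */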
948
949 fs_inst *
950 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
951 glsl_interp_qualifier interpolation_mode,
952 bool is_centroid)
953 {
954 brw_wm_barycentric_interp_mode barycoord_mode;
955 if (brw->gen >= 6) {
956 if (is_centroid) {
957 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
958 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
959 else
960 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
961 } else {
962 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
963 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
964 else
965 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
966 }
967 } else {
968 /* On Ironlake and below, there is only one interpolation mode.
969 * Centroid interpolation doesn't mean anything on this hardware --
970 * there is no multisampling.
971 */
972 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
973 }
974 return emit(FS_OPCODE_LINTERP, attr,
975 this->delta_x[barycoord_mode],
976 this->delta_y[barycoord_mode], interp);
977 }
978
979 fs_reg *
980 fs_visitor::emit_general_interpolation(ir_variable *ir)
981 {
982 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
983 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
984 fs_reg attr = *reg;
985
986 unsigned int array_elements;
987 const glsl_type *type;
988
989 if (ir->type->is_array()) {
990 array_elements = ir->type->length;
991 if (array_elements == 0) {
992 fail("dereferenced array '%s' has length 0\n", ir->name);
993 }
994 type = ir->type->fields.array;
995 } else {
996 array_elements = 1;
997 type = ir->type;
998 }
999
1000 glsl_interp_qualifier interpolation_mode =
1001 ir->determine_interpolation_mode(c->key.flat_shade);
1002
1003 int location = ir->location;
1004 for (unsigned int i = 0; i < array_elements; i++) {
1005 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1006 if (urb_setup[location] == -1) {
1007 /* If there's no incoming setup data for this slot, don't
1008 * emit interpolation for it.
1009 */
1010 attr.reg_offset += type->vector_elements;
1011 location++;
1012 continue;
1013 }
1014
1015 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1016 /* Constant interpolation (flat shading) case. The SF has
1017 * handed us defined values in only the constant offset
1018 * field of the setup reg.
1019 */
1020 for (unsigned int k = 0; k < type->vector_elements; k++) {
1021 struct brw_reg interp = interp_reg(location, k);
1022 interp = suboffset(interp, 3);
1023 interp.type = reg->type;
1024 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1025 attr.reg_offset++;
1026 }
1027 } else {
1028 /* Smooth/noperspective interpolation case. */
1029 for (unsigned int k = 0; k < type->vector_elements; k++) {
1030 /* FINISHME: At some point we probably want to push
1031 * this farther by giving similar treatment to the
1032 * other potentially constant components of the
1033 * attribute, as well as making brw_vs_constval.c
1034 * handle varyings other than gl_TexCoord.
1035 */
1036 struct brw_reg interp = interp_reg(location, k);
1037 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1038 ir->centroid);
1039 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1040 /* Get the pixel/sample mask into f0 so that we know
1041 * which pixels are lit. Then, for each channel that is
1042 * unlit, replace the centroid data with non-centroid
1043 * data.
1044 */
1045 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1046 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1047 interpolation_mode, false);
1048 inst->predicate = BRW_PREDICATE_NORMAL;
1049 inst->predicate_inverse = true;
1050 }
1051 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1052 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1053 }
1054 attr.reg_offset++;
1055 }
1056
1057 }
1058 location++;
1059 }
1060 }
1061
1062 return reg;
1063 }
1064
1065 fs_reg *
1066 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1067 {
1068 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1069
1070 /* The frontfacing comes in as a bit in the thread payload. */
1071 if (brw->gen >= 6) {
1072 emit(BRW_OPCODE_ASR, *reg,
1073 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1074 fs_reg(15));
1075 emit(BRW_OPCODE_NOT, *reg, *reg);
1076 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1077 } else {
1078 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1079 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1080 * us front face
1081 */
1082 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1083 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1084 }
1085
1086 return reg;
1087 }
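/* A worked example of the gen6+ bit math above, assuming bit 15 of R0.0
 * is the back-facing bit: for a back-facing primitive, (R0.0<d> >> 15) has
 * bit 0 set, NOT clears it, and AND with 1 yields 0; for a front-facing
 * primitive the same sequence yields 1.
 */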
1088
1089 fs_reg
1090 fs_visitor::fix_math_operand(fs_reg src)
1091 {
1092 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1093 * might be able to do better by doing execsize = 1 math and then
1094 * expanding that result out, but we would need to be careful with
1095 * masking.
1096 *
1097 * The hardware ignores source modifiers (negate and abs) on math
1098 * instructions, so we also move to a temp to set those up.
1099 */
1100 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1101 !src.abs && !src.negate)
1102 return src;
1103
1104 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1105 * operands to math
1106 */
1107 if (brw->gen >= 7 && src.file != IMM)
1108 return src;
1109
1110 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1111 expanded.type = src.type;
1112 emit(BRW_OPCODE_MOV, expanded, src);
1113 return expanded;
1114 }
1115
1116 fs_inst *
1117 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1118 {
1119 switch (opcode) {
1120 case SHADER_OPCODE_RCP:
1121 case SHADER_OPCODE_RSQ:
1122 case SHADER_OPCODE_SQRT:
1123 case SHADER_OPCODE_EXP2:
1124 case SHADER_OPCODE_LOG2:
1125 case SHADER_OPCODE_SIN:
1126 case SHADER_OPCODE_COS:
1127 break;
1128 default:
1129 assert(!"not reached: bad math opcode");
1130 return NULL;
1131 }
1132
1133 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1134 * might be able to do better by doing execsize = 1 math and then
1135 * expanding that result out, but we would need to be careful with
1136 * masking.
1137 *
1138 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1139 * instructions, so we also move to a temp to set those up.
1140 */
1141 if (brw->gen >= 6)
1142 src = fix_math_operand(src);
1143
1144 fs_inst *inst = emit(opcode, dst, src);
1145
1146 if (brw->gen < 6) {
1147 inst->base_mrf = 2;
1148 inst->mlen = dispatch_width / 8;
1149 }
1150
1151 return inst;
1152 }
1153
1154 fs_inst *
1155 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1156 {
1157 int base_mrf = 2;
1158 fs_inst *inst;
1159
1160 switch (opcode) {
1161 case SHADER_OPCODE_INT_QUOTIENT:
1162 case SHADER_OPCODE_INT_REMAINDER:
1163 if (brw->gen >= 7 && dispatch_width == 16)
1164 fail("16-wide INTDIV unsupported\n");
1165 break;
1166 case SHADER_OPCODE_POW:
1167 break;
1168 default:
1169 assert(!"not reached: unsupported binary math opcode.");
1170 return NULL;
1171 }
1172
1173 if (brw->gen >= 6) {
1174 src0 = fix_math_operand(src0);
1175 src1 = fix_math_operand(src1);
1176
1177 inst = emit(opcode, dst, src0, src1);
1178 } else {
1179 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1180 * "Message Payload":
1181 *
1182 * "Operand0[7]. For the INT DIV functions, this operand is the
1183 * denominator."
1184 * ...
1185 * "Operand1[7]. For the INT DIV functions, this operand is the
1186 * numerator."
1187 */
1188 bool is_int_div = opcode != SHADER_OPCODE_POW;
1189 fs_reg &op0 = is_int_div ? src1 : src0;
1190 fs_reg &op1 = is_int_div ? src0 : src1;
1191
1192 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1193 inst = emit(opcode, dst, op0, reg_null_f);
1194
1195 inst->base_mrf = base_mrf;
1196 inst->mlen = 2 * dispatch_width / 8;
1197 }
1198 return inst;
1199 }
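/* A sketch of the pre-gen6 operand swap above: the INT DIV message wants
 * the denominator as Operand0 and the numerator as Operand1, so for
 * dst = src0 / src1 we emit roughly
 *
 *    MOV   m(base_mrf + 1), src0     // numerator into the payload
 *    MATH  dst, src1                 // denominator as the direct operand
 *
 * whereas POW keeps src0 and src1 in their natural order.
 */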
1200
1201 void
1202 fs_visitor::assign_curb_setup()
1203 {
1204 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1205 if (dispatch_width == 8) {
1206 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1207 } else {
1208 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1209 }
1210
1211 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1212 foreach_list(node, &this->instructions) {
1213 fs_inst *inst = (fs_inst *)node;
1214
1215 for (unsigned int i = 0; i < 3; i++) {
1216 if (inst->src[i].file == UNIFORM) {
1217 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1218 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1219 constant_nr / 8,
1220 constant_nr % 8);
1221
1222 inst->src[i].file = HW_REG;
1223 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1224 }
1225 }
1226 }
1227 }
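/* An example of the mapping above, assuming nr_payload_regs == 2: the
 * uniform at constant_nr == 11 becomes fixed register g3.3 -- GRF
 * 2 + 11 / 8 = 3, subregister 11 % 8 = 3 -- since eight float constants
 * pack into each GRF.
 */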
1228
1229 void
1230 fs_visitor::calculate_urb_setup()
1231 {
1232 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1233 urb_setup[i] = -1;
1234 }
1235
1236 int urb_next = 0;
1237 /* Figure out where each of the incoming setup attributes lands. */
1238 if (brw->gen >= 6) {
1239 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1240 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1241 urb_setup[i] = urb_next++;
1242 }
1243 }
1244 } else {
1245 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1246 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1247 /* Point size is packed into the header, not as a general attribute */
1248 if (i == VARYING_SLOT_PSIZ)
1249 continue;
1250
1251 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1252 /* The back color slot is skipped when the front color is
1253 * also written to. In addition, some slots can be
1254 * written in the vertex shader and not read in the
1255 * fragment shader. So the register number must always be
1256 * incremented, mapped or not.
1257 */
1258 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1259 urb_setup[i] = urb_next;
1260 urb_next++;
1261 }
1262 }
1263
1264 /*
 1265     * gl_PointCoord is a FS-only attribute, and the SF thread did the
 1266     * interpolation for it, so count it here, too.
1267 *
1268 * See compile_sf_prog() for more info.
1269 */
1270 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1271 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1272 }
1273
1274 /* Each attribute is 4 setup channels, each of which is half a reg. */
1275 c->prog_data.urb_read_length = urb_next * 2;
1276 }
1277
1278 void
1279 fs_visitor::assign_urb_setup()
1280 {
1281 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1282
 1283    /* Offset all the urb_setup[] indices by the actual position of the
1284 * setup regs, now that the location of the constants has been chosen.
1285 */
1286 foreach_list(node, &this->instructions) {
1287 fs_inst *inst = (fs_inst *)node;
1288
1289 if (inst->opcode == FS_OPCODE_LINTERP) {
1290 assert(inst->src[2].file == HW_REG);
1291 inst->src[2].fixed_hw_reg.nr += urb_start;
1292 }
1293
1294 if (inst->opcode == FS_OPCODE_CINTERP) {
1295 assert(inst->src[0].file == HW_REG);
1296 inst->src[0].fixed_hw_reg.nr += urb_start;
1297 }
1298 }
1299
1300 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1301 }
1302
1303 /**
1304 * Split large virtual GRFs into separate components if we can.
1305 *
1306 * This is mostly duplicated with what brw_fs_vector_splitting does,
1307 * but that's really conservative because it's afraid of doing
1308 * splitting that doesn't result in real progress after the rest of
1309 * the optimization phases, which would cause infinite looping in
1310 * optimization. We can do it once here, safely. This also has the
1311 * opportunity to split interpolated values, or maybe even uniforms,
1312 * which we don't have at the IR level.
1313 *
1314 * We want to split, because virtual GRFs are what we register
1315 * allocate and spill (due to contiguousness requirements for some
1316 * instructions), and they're what we naturally generate in the
1317 * codegen process, but most virtual GRFs don't actually need to be
1318 * contiguous sets of GRFs. If we split, we'll end up with reduced
1319 * live intervals and better dead code elimination and coalescing.
1320 */
1321 void
1322 fs_visitor::split_virtual_grfs()
1323 {
1324 int num_vars = this->virtual_grf_count;
1325 bool split_grf[num_vars];
1326 int new_virtual_grf[num_vars];
1327
1328 /* Try to split anything > 0 sized. */
1329 for (int i = 0; i < num_vars; i++) {
1330 if (this->virtual_grf_sizes[i] != 1)
1331 split_grf[i] = true;
1332 else
1333 split_grf[i] = false;
1334 }
1335
1336 if (brw->has_pln &&
1337 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1338 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1339 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1340 * Gen6, that was the only supported interpolation mode, and since Gen6,
1341 * delta_x and delta_y are in fixed hardware registers.
1342 */
1343 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1344 false;
1345 }
1346
1347 foreach_list(node, &this->instructions) {
1348 fs_inst *inst = (fs_inst *)node;
1349
1350 /* If there's a SEND message that requires contiguous destination
1351 * registers, no splitting is allowed.
1352 */
1353 if (inst->regs_written > 1) {
1354 split_grf[inst->dst.reg] = false;
1355 }
1356
1357 /* If we're sending from a GRF, don't split it, on the assumption that
1358 * the send is reading the whole thing.
1359 */
1360 if (inst->is_send_from_grf()) {
1361 split_grf[inst->src[0].reg] = false;
1362 }
1363 }
1364
1365 /* Allocate new space for split regs. Note that the virtual
1366 * numbers will be contiguous.
1367 */
1368 for (int i = 0; i < num_vars; i++) {
1369 if (split_grf[i]) {
1370 new_virtual_grf[i] = virtual_grf_alloc(1);
1371 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1372 int reg = virtual_grf_alloc(1);
1373 assert(reg == new_virtual_grf[i] + j - 1);
1374 (void) reg;
1375 }
1376 this->virtual_grf_sizes[i] = 1;
1377 }
1378 }
1379
1380 foreach_list(node, &this->instructions) {
1381 fs_inst *inst = (fs_inst *)node;
1382
1383 if (inst->dst.file == GRF &&
1384 split_grf[inst->dst.reg] &&
1385 inst->dst.reg_offset != 0) {
1386 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1387 inst->dst.reg_offset - 1);
1388 inst->dst.reg_offset = 0;
1389 }
1390 for (int i = 0; i < 3; i++) {
1391 if (inst->src[i].file == GRF &&
1392 split_grf[inst->src[i].reg] &&
1393 inst->src[i].reg_offset != 0) {
1394 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1395 inst->src[i].reg_offset - 1);
1396 inst->src[i].reg_offset = 0;
1397 }
1398 }
1399 }
1400 this->live_intervals_valid = false;
1401 }
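/* A sketch of the renumbering above: a splittable VGRF 5 of size 4 keeps
 * reg_offset 0 under register 5, while offsets 1, 2 and 3 move to three
 * freshly allocated size-1 registers n, n + 1 and n + 2 (contiguous by
 * construction), each with reg_offset reset to 0.
 */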
1402
1403 /**
1404 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1405 *
1406 * During code generation, we create tons of temporary variables, many of
1407 * which get immediately killed and are never used again. Yet, in later
1408 * optimization and analysis passes, such as compute_live_intervals, we need
1409 * to loop over all the virtual GRFs. Compacting them can save a lot of
1410 * overhead.
1411 */
1412 void
1413 fs_visitor::compact_virtual_grfs()
1414 {
1415 /* Mark which virtual GRFs are used, and count how many. */
1416 int remap_table[this->virtual_grf_count];
1417 memset(remap_table, -1, sizeof(remap_table));
1418
1419 foreach_list(node, &this->instructions) {
1420 const fs_inst *inst = (const fs_inst *) node;
1421
1422 if (inst->dst.file == GRF)
1423 remap_table[inst->dst.reg] = 0;
1424
1425 for (int i = 0; i < 3; i++) {
1426 if (inst->src[i].file == GRF)
1427 remap_table[inst->src[i].reg] = 0;
1428 }
1429 }
1430
1431 /* In addition to registers used in instructions, fs_visitor keeps
1432 * direct references to certain special values which must be patched:
1433 */
1434 fs_reg *special[] = {
1435 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1436 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1437 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1438 &delta_x[0], &delta_x[1], &delta_x[2],
1439 &delta_x[3], &delta_x[4], &delta_x[5],
1440 &delta_y[0], &delta_y[1], &delta_y[2],
1441 &delta_y[3], &delta_y[4], &delta_y[5],
1442 };
1443 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1444 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1445
1446 /* Treat all special values as used, to be conservative */
1447 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1448 if (special[i]->file == GRF)
1449 remap_table[special[i]->reg] = 0;
1450 }
1451
1452 /* Compact the GRF arrays. */
1453 int new_index = 0;
1454 for (int i = 0; i < this->virtual_grf_count; i++) {
1455 if (remap_table[i] != -1) {
1456 remap_table[i] = new_index;
1457 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1458 if (live_intervals_valid) {
1459 virtual_grf_start[new_index] = virtual_grf_start[i];
1460 virtual_grf_end[new_index] = virtual_grf_end[i];
1461 }
1462 ++new_index;
1463 }
1464 }
1465
1466 this->virtual_grf_count = new_index;
1467
1468 /* Patch all the instructions to use the newly renumbered registers */
1469 foreach_list(node, &this->instructions) {
1470 fs_inst *inst = (fs_inst *) node;
1471
1472 if (inst->dst.file == GRF)
1473 inst->dst.reg = remap_table[inst->dst.reg];
1474
1475 for (int i = 0; i < 3; i++) {
1476 if (inst->src[i].file == GRF)
1477 inst->src[i].reg = remap_table[inst->src[i].reg];
1478 }
1479 }
1480
1481 /* Patch all the references to special values */
1482 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1483 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1484 special[i]->reg = remap_table[special[i]->reg];
1485 }
1486 }
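/* An example of the compaction above: if only VGRFs 0, 3 and 4 of six are
 * referenced, remap_table becomes { 0, -1, -1, 1, 2, -1 }, virtual_grf_count
 * drops to 3, and all instructions and special values are rewritten to the
 * new indices.
 */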
1487
1488 bool
1489 fs_visitor::remove_dead_constants()
1490 {
1491 if (dispatch_width == 8) {
1492 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1493 this->nr_params_remap = c->prog_data.nr_params;
1494
1495 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1496 this->params_remap[i] = -1;
1497
1498 /* Find which params are still in use. */
1499 foreach_list(node, &this->instructions) {
1500 fs_inst *inst = (fs_inst *)node;
1501
1502 for (int i = 0; i < 3; i++) {
1503 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1504
1505 if (inst->src[i].file != UNIFORM)
1506 continue;
1507
1508 /* Section 5.11 of the OpenGL 4.3 spec says:
1509 *
1510 * "Out-of-bounds reads return undefined values, which include
1511 * values from other variables of the active program or zero."
1512 */
1513 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1514 constant_nr = 0;
1515 }
1516
1517 /* For now, set this to non-negative. We'll give it the
1518 * actual new number in a moment, in order to keep the
1519 * register numbers nicely ordered.
1520 */
1521 this->params_remap[constant_nr] = 0;
1522 }
1523 }
1524
1525 /* Figure out what the new numbers for the params will be. At some
1526 * point when we're doing uniform array access, we're going to want
1527 * to keep the distinction between .reg and .reg_offset, but for
1528 * now we don't care.
1529 */
1530 unsigned int new_nr_params = 0;
1531 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1532 if (this->params_remap[i] != -1) {
1533 this->params_remap[i] = new_nr_params++;
1534 }
1535 }
1536
1537 /* Update the list of params to be uploaded to match our new numbering. */
1538 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1539 int remapped = this->params_remap[i];
1540
1541 if (remapped == -1)
1542 continue;
1543
1544 c->prog_data.param[remapped] = c->prog_data.param[i];
1545 }
1546
1547 c->prog_data.nr_params = new_nr_params;
1548 } else {
1549 /* This should have been generated in the 8-wide pass already. */
1550 assert(this->params_remap);
1551 }
1552
1553 /* Now do the renumbering of the shader to remove unused params. */
1554 foreach_list(node, &this->instructions) {
1555 fs_inst *inst = (fs_inst *)node;
1556
1557 for (int i = 0; i < 3; i++) {
1558 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1559
1560 if (inst->src[i].file != UNIFORM)
1561 continue;
1562
 1563          /* As above, alias out-of-bounds accesses to constant 0. */
1564 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1565 constant_nr = 0;
1566 }
1567 assert(this->params_remap[constant_nr] != -1);
1568 inst->src[i].reg = this->params_remap[constant_nr];
1569 inst->src[i].reg_offset = 0;
1570 }
1571 }
1572
1573 return true;
1574 }
1575
1576 /*
1577 * Implements array access of uniforms by inserting a
1578 * PULL_CONSTANT_LOAD instruction.
1579 *
1580 * Unlike temporary GRF array access (where we don't support it due to
1581 * the difficulty of doing relative addressing on instruction
1582 * destinations), we could potentially do array access of uniforms
1583 * that were loaded in GRF space as push constants. In real-world
1584 * usage we've seen, though, the arrays being used are always larger
1585 * than we could load as push constants, so just always move all
1586 * uniform array access out to a pull constant buffer.
1587 */
1588 void
1589 fs_visitor::move_uniform_array_access_to_pull_constants()
1590 {
1591 int pull_constant_loc[c->prog_data.nr_params];
1592
1593 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1594 pull_constant_loc[i] = -1;
1595 }
1596
1597 /* Walk through and find array access of uniforms. Put a copy of that
1598 * uniform in the pull constant buffer.
1599 *
1600 * Note that we don't move constant-indexed accesses to arrays. No
1601 * testing has been done of the performance impact of this choice.
1602 */
1603 foreach_list_safe(node, &this->instructions) {
1604 fs_inst *inst = (fs_inst *)node;
1605
1606 for (int i = 0 ; i < 3; i++) {
1607 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1608 continue;
1609
1610 int uniform = inst->src[i].reg;
1611
1612 /* If this array isn't already present in the pull constant buffer,
1613 * add it.
1614 */
1615 if (pull_constant_loc[uniform] == -1) {
1616 const float **values = &c->prog_data.param[uniform];
1617
1618 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1619
1620 assert(param_size[uniform]);
1621
1622 for (int j = 0; j < param_size[uniform]; j++) {
1623 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1624 values[j];
1625 }
1626 }
1627
1628 /* Set up the annotation tracking for new generated instructions. */
1629 base_ir = inst->ir;
1630 current_annotation = inst->annotation;
1631
1632 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1633 fs_reg temp = fs_reg(this, glsl_type::float_type);
1634 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1635 surf_index,
1636 *inst->src[i].reladdr,
1637 pull_constant_loc[uniform] +
1638 inst->src[i].reg_offset);
1639 inst->insert_before(&list);
1640
1641 inst->src[i].file = temp.file;
1642 inst->src[i].reg = temp.reg;
1643 inst->src[i].reg_offset = temp.reg_offset;
1644 inst->src[i].reladdr = NULL;
1645 }
1646 }
1647 }
1648
1649 /**
1650 * Choose accesses from the UNIFORM file to demote to using the pull
1651 * constant buffer.
1652 *
1653 * We allow a fragment shader to have more than the specified minimum
1654 * maximum number of fragment shader uniform components (64). If
 1655  * there are too many of these, they'd fill up all of the register space.
1656 * So, this will push some of them out to the pull constant buffer and
1657 * update the program to load them.
1658 */
1659 void
1660 fs_visitor::setup_pull_constants()
1661 {
1662 /* Only allow 16 registers (128 uniform components) as push constants. */
1663 unsigned int max_uniform_components = 16 * 8;
1664 if (c->prog_data.nr_params <= max_uniform_components)
1665 return;
1666
1667 if (dispatch_width == 16) {
1668 fail("Pull constants not supported in 16-wide\n");
1669 return;
1670 }
1671
1672 /* Just demote the end of the list. We could probably do better
1673 * here, demoting things that are rarely used in the program first.
1674 */
1675 unsigned int pull_uniform_base = max_uniform_components;
1676
1677 int pull_constant_loc[c->prog_data.nr_params];
1678 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1679 if (i < pull_uniform_base) {
1680 pull_constant_loc[i] = -1;
1681 } else {
1682 pull_constant_loc[i] = -1;
1683 /* If our constant is already being uploaded for reladdr purposes,
1684 * reuse it.
1685 */
1686 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1687 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1688 pull_constant_loc[i] = j;
1689 break;
1690 }
1691 }
1692 if (pull_constant_loc[i] == -1) {
1693 int pull_index = c->prog_data.nr_pull_params++;
1694 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
 1695             pull_constant_loc[i] = pull_index;
1696 }
1697 }
1698 }
1699 c->prog_data.nr_params = pull_uniform_base;
1700
1701 foreach_list(node, &this->instructions) {
1702 fs_inst *inst = (fs_inst *)node;
1703
1704 for (int i = 0; i < 3; i++) {
1705 if (inst->src[i].file != UNIFORM)
1706 continue;
1707
1708 int pull_index = pull_constant_loc[inst->src[i].reg +
1709 inst->src[i].reg_offset];
1710 if (pull_index == -1)
1711 continue;
1712
1713 assert(!inst->src[i].reladdr);
1714
1715 fs_reg dst = fs_reg(this, glsl_type::float_type);
1716 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1717 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1718 fs_inst *pull =
1719 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1720 dst, index, offset);
1721 pull->ir = inst->ir;
1722 pull->annotation = inst->annotation;
1723
1724 inst->insert_before(pull);
1725
1726 inst->src[i].file = GRF;
1727 inst->src[i].reg = dst.reg;
1728 inst->src[i].reg_offset = 0;
1729 inst->src[i].smear = pull_index & 3;
1730 }
1731 }
1732 }
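/* An example of the offset/smear math above: pull_index == 5 yields
 * offset = (5 * 4) & ~15 = 16 (the 16-byte-aligned slot holding floats
 * 4..7) and smear = 5 & 3 = 1, so the pull load fetches that vec4 and the
 * patched instruction reads its second component.
 */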
1733
1734 bool
1735 fs_visitor::opt_algebraic()
1736 {
1737 bool progress = false;
1738
1739 foreach_list(node, &this->instructions) {
1740 fs_inst *inst = (fs_inst *)node;
1741
1742 switch (inst->opcode) {
1743 case BRW_OPCODE_MUL:
1744 if (inst->src[1].file != IMM)
1745 continue;
1746
1747 /* a * 1.0 = a */
1748 if (inst->src[1].is_one()) {
1749 inst->opcode = BRW_OPCODE_MOV;
1750 inst->src[1] = reg_undef;
1751 progress = true;
1752 break;
1753 }
1754
1755 /* a * 0.0 = 0.0 */
1756 if (inst->src[1].is_zero()) {
1757 inst->opcode = BRW_OPCODE_MOV;
1758 inst->src[0] = inst->src[1];
1759 inst->src[1] = reg_undef;
1760 progress = true;
1761 break;
1762 }
1763
1764 break;
1765 case BRW_OPCODE_ADD:
1766 if (inst->src[1].file != IMM)
1767 continue;
1768
1769 /* a + 0.0 = a */
1770 if (inst->src[1].is_zero()) {
1771 inst->opcode = BRW_OPCODE_MOV;
1772 inst->src[1] = reg_undef;
1773 progress = true;
1774 break;
1775 }
1776 break;
1777 default:
1778 break;
1779 }
1780 }
1781
1782 return progress;
1783 }
1784
1785 /**
1786 * Removes any instructions writing a VGRF where that VGRF is not used by any
1787 * later instruction.
1788 */
1789 bool
1790 fs_visitor::dead_code_eliminate()
1791 {
1792 bool progress = false;
1793 int pc = 0;
1794
1795 calculate_live_intervals();
1796
1797 foreach_list_safe(node, &this->instructions) {
1798 fs_inst *inst = (fs_inst *)node;
1799
1800 if (inst->dst.file == GRF) {
1801 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1802 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1803 inst->remove();
1804 progress = true;
1805 }
1806 }
1807
1808 pc++;
1809 }
1810
1811 if (progress)
1812 live_intervals_valid = false;
1813
1814 return progress;
1815 }
1816
1817 struct dead_code_hash_key
1818 {
1819 int vgrf;
1820 int reg_offset;
1821 };
1822
1823 static bool
1824 dead_code_hash_compare(const void *a, const void *b)
1825 {
1826 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1827 }
1828
1829 static void
1830 clear_dead_code_hash(struct hash_table *ht)
1831 {
1832 struct hash_entry *entry;
1833
1834 hash_table_foreach(ht, entry) {
1835 _mesa_hash_table_remove(ht, entry);
1836 }
1837 }
1838
1839 static void
1840 insert_dead_code_hash(struct hash_table *ht,
1841 int vgrf, int reg_offset, fs_inst *inst)
1842 {
1843 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1844 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1845
1846 key->vgrf = vgrf;
1847 key->reg_offset = reg_offset;
1848
1849 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1850 }
1851
1852 static struct hash_entry *
1853 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1854 {
1855 struct dead_code_hash_key key;
1856
1857 key.vgrf = vgrf;
1858 key.reg_offset = reg_offset;
1859
1860 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1861 }
1862
1863 static void
1864 remove_dead_code_hash(struct hash_table *ht,
1865 int vgrf, int reg_offset)
1866 {
1867 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1868 if (!entry)
1869 return;
1870
1871 _mesa_hash_table_remove(ht, entry);
1872 }
1873
1874 /**
1875 * Walks basic blocks, removing any regs that are written but not read before
1876 * being redefined.
1877 *
1878 * The dead_code_eliminate() function implements a global dead code
 1879  * elimination, but it only handles removing the last write to a register
1880 * if it's never read. This one can handle intermediate writes, but only
1881 * within a basic block.
1882 */
1883 bool
1884 fs_visitor::dead_code_eliminate_local()
1885 {
1886 struct hash_table *ht;
1887 bool progress = false;
1888
1889 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1890
1891 foreach_list_safe(node, &this->instructions) {
1892 fs_inst *inst = (fs_inst *)node;
1893
 1894       /* At a basic block boundary, empty the HT since we don't track
 1895        * dataflow across blocks.
1896 */
1897 if (inst->is_control_flow()) {
1898 clear_dead_code_hash(ht);
1899 continue;
1900 }
1901
1902 /* Clear the HT of any instructions that got read. */
1903 for (int i = 0; i < 3; i++) {
1904 fs_reg src = inst->src[i];
1905 if (src.file != GRF)
1906 continue;
1907
1908 int read = 1;
1909 if (inst->is_send_from_grf())
1910 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1911
1912 for (int reg_offset = src.reg_offset;
1913 reg_offset < src.reg_offset + read;
1914 reg_offset++) {
1915 remove_dead_code_hash(ht, src.reg, reg_offset);
1916 }
1917 }
1918
1919 /* Add any update of a GRF to the HT, removing a previous write if it
1920 * wasn't read.
1921 */
1922 if (inst->dst.file == GRF) {
1923 if (inst->regs_written > 1) {
1924 /* We don't know how to trim channels from an instruction's
1925 * writes, so we can't incrementally remove unread channels from
1926 * it. Just remove whatever it overwrites from the table
1927 */
1928 for (int i = 0; i < inst->regs_written; i++) {
1929 remove_dead_code_hash(ht,
1930 inst->dst.reg,
1931 inst->dst.reg_offset + i);
1932 }
1933 } else {
1934 struct hash_entry *entry =
1935 get_dead_code_hash_entry(ht, inst->dst.reg,
1936 inst->dst.reg_offset);
1937
1938 if (inst->is_partial_write()) {
1939 /* For a partial write, we can't remove any previous dead code
 1940              * candidate, since we're just modifying its result, but we can
 1941              * be dead-code eliminated ourselves.
1942 */
1943 if (entry) {
1944 entry->data = inst;
1945 } else {
1946 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1947 inst);
1948 }
1949 } else {
1950 if (entry) {
1951 /* We're completely updating a channel, and there was a
1952 * previous write to the channel that wasn't read. Kill it!
1953 */
1954 fs_inst *dead_inst = (fs_inst *)entry->data;
1955 dead_inst->remove();
1956 progress = true;
1957 _mesa_hash_table_remove(ht, entry);
1958 }
1959
1960 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1961 inst);
1962 }
1963 }
1964 }
1965 }
1966
1967 _mesa_hash_table_destroy(ht, NULL);
1968
1969 if (progress)
1970 live_intervals_valid = false;
1971
1972 return progress;
1973 }
1974
1975 /**
1976 * Implements a second type of register coalescing: this one checks if
1977 * the two regs involved in a raw move don't interfere, in which case
1978 * they can both be stored in the same place and the MOV removed.
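 *
 * For example, in this made-up sequence (illustrative only):
 *
 *    add vgrf1, vgrf2, vgrf3
 *    mov vgrf4, vgrf1
 *
 * if vgrf1 and vgrf4 don't interfere, every reference to vgrf1 is
 * rewritten to vgrf4 and the MOV is deleted.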
1979 */
1980 bool
1981 fs_visitor::register_coalesce_2()
1982 {
1983 bool progress = false;
1984
1985 calculate_live_intervals();
1986
1987 foreach_list_safe(node, &this->instructions) {
1988 fs_inst *inst = (fs_inst *)node;
1989
1990 if (inst->opcode != BRW_OPCODE_MOV ||
1991 inst->is_partial_write() ||
1992 inst->saturate ||
1993 inst->src[0].file != GRF ||
1994 inst->src[0].negate ||
1995 inst->src[0].abs ||
1996 inst->src[0].smear != -1 ||
1997 inst->dst.file != GRF ||
1998 inst->dst.type != inst->src[0].type ||
1999 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2000 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2001 continue;
2002 }
2003
2004 int reg_from = inst->src[0].reg;
2005 assert(inst->src[0].reg_offset == 0);
2006 int reg_to = inst->dst.reg;
2007 int reg_to_offset = inst->dst.reg_offset;
2008
2009 foreach_list(node, &this->instructions) {
2010 fs_inst *scan_inst = (fs_inst *)node;
2011
2012 if (scan_inst->dst.file == GRF &&
2013 scan_inst->dst.reg == reg_from) {
2014 scan_inst->dst.reg = reg_to;
2015 scan_inst->dst.reg_offset = reg_to_offset;
2016 }
2017 for (int i = 0; i < 3; i++) {
2018 if (scan_inst->src[i].file == GRF &&
2019 scan_inst->src[i].reg == reg_from) {
2020 scan_inst->src[i].reg = reg_to;
2021 scan_inst->src[i].reg_offset = reg_to_offset;
2022 }
2023 }
2024 }
2025
2026 inst->remove();
2027
2028 /* We don't need to recalculate live intervals inside the loop despite
2029 * flagging live_intervals_valid because we only use live intervals for
2030 * the interferes test, and we must have had a situation where the
2031 * intervals were:
2032 *
2033 * from to
2034 * ^
2035 * |
2036 * v
2037 * ^
2038 * |
2039 * v
2040 *
2041 * Some register R that might get coalesced with one of these two could
2042 * only be referencing "to", otherwise "from"'s range would have been
2043 * longer. R's range could also only start at the end of "to" or later,
2044 * otherwise it would conflict with "to" when we try to coalesce "to"
2045 * into R anyway.
2046 */
2047 live_intervals_valid = false;
2048
2049 progress = true;
2050 continue;
2051 }
2052
2053 return progress;
2054 }
2055
2056 bool
2057 fs_visitor::register_coalesce()
2058 {
2059 bool progress = false;
2060 int if_depth = 0;
2061 int loop_depth = 0;
2062
2063 foreach_list_safe(node, &this->instructions) {
2064 fs_inst *inst = (fs_inst *)node;
2065
2066 /* Make sure that we dominate the instructions we're going to
2067 * scan for interference with our coalescing, or we won't have
2068 * scanned far enough to notice the interference. We don't
2069 * dominate the following instructions if we're inside a loop or
2070 * an if block.
2071 */
2072 switch (inst->opcode) {
2073 case BRW_OPCODE_DO:
2074 loop_depth++;
2075 break;
2076 case BRW_OPCODE_WHILE:
2077 loop_depth--;
2078 break;
2079 case BRW_OPCODE_IF:
2080 if_depth++;
2081 break;
2082 case BRW_OPCODE_ENDIF:
2083 if_depth--;
2084 break;
2085 default:
2086 break;
2087 }
2088 if (loop_depth || if_depth)
2089 continue;
2090
2091 if (inst->opcode != BRW_OPCODE_MOV ||
2092 inst->is_partial_write() ||
2093 inst->saturate ||
2094 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2095 inst->src[0].file != UNIFORM) ||
2096 inst->dst.type != inst->src[0].type)
2097 continue;
2098
2099 bool has_source_modifiers = (inst->src[0].abs ||
2100 inst->src[0].negate ||
2101 inst->src[0].smear != -1 ||
2102 inst->src[0].file == UNIFORM);
2103
2104 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2105 * them: check for no writes to either one until the exit of the
2106 * program.
2107 */
2108 bool interfered = false;
2109
2110 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2111 !scan_inst->is_tail_sentinel();
2112 scan_inst = (fs_inst *)scan_inst->next) {
2113 if (scan_inst->dst.file == GRF) {
2114 if (scan_inst->overwrites_reg(inst->dst) ||
2115 scan_inst->overwrites_reg(inst->src[0])) {
2116 interfered = true;
2117 break;
2118 }
2119 }
2120
2121 /* The gen6 MATH instruction can't handle source modifiers or
2122 * unusual register regions, so avoid coalescing those for
2123 * now. We should do something more specific.
2124 */
2125 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2126 interfered = true;
2127 break;
2128 }
2129
2130 /* The accumulator result appears to get used for the
2131 * conditional modifier generation. When negating a UD
2132 * value, there is a 33rd bit generated for the sign in the
2133 * accumulator value, so now you can't check, for example,
2134 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2135 */
2136 if (scan_inst->conditional_mod &&
2137 inst->src[0].negate &&
2138 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2139 interfered = true;
2140 break;
2141 }
2142 }
2143 if (interfered) {
2144 continue;
2145 }
2146
2147 /* Rewrite the later usages to point at the source of the MOV
2148 * being removed.
2149 */
2150 for (fs_inst *scan_inst = inst;
2151 !scan_inst->is_tail_sentinel();
2152 scan_inst = (fs_inst *)scan_inst->next) {
2153 for (int i = 0; i < 3; i++) {
2154 if (scan_inst->src[i].file == GRF &&
2155 scan_inst->src[i].reg == inst->dst.reg &&
2156 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2157 fs_reg new_src = inst->src[0];
2158 if (scan_inst->src[i].abs) {
2159 new_src.negate = 0;
2160 new_src.abs = 1;
2161 }
2162 new_src.negate ^= scan_inst->src[i].negate;
2163 scan_inst->src[i] = new_src;
2164 }
2165 }
2166 }
2167
2168 inst->remove();
2169 progress = true;
2170 }
2171
2172 if (progress)
2173 live_intervals_valid = false;
2174
2175 return progress;
2176 }
2177
2178
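/**
 * Tries to rewrite the instruction that produced a GRF, when that GRF is
 * only MOVed into an MRF, so that it writes the MRF directly and the MOV
 * can be removed.
 *
 * For example, in this made-up sequence (illustrative only):
 *
 *    add vgrf4, vgrf2, vgrf3
 *    mov m1, vgrf4
 *
 * becomes:
 *
 *    add m1, vgrf2, vgrf3
 */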
2179 bool
2180 fs_visitor::compute_to_mrf()
2181 {
2182 bool progress = false;
2183 int next_ip = 0;
2184
2185 calculate_live_intervals();
2186
2187 foreach_list_safe(node, &this->instructions) {
2188 fs_inst *inst = (fs_inst *)node;
2189
2190 int ip = next_ip;
2191 next_ip++;
2192
2193 if (inst->opcode != BRW_OPCODE_MOV ||
2194 inst->is_partial_write() ||
2195 inst->dst.file != MRF || inst->src[0].file != GRF ||
2196 inst->dst.type != inst->src[0].type ||
2197 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2198 continue;
2199
2200 /* Work out which hardware MRF registers are written by this
2201 * instruction.
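 *
 * A COMPR4 write touches mrf_low and mrf_low + 4; a compressed 16-wide
 * write touches mrf_low and mrf_low + 1; anything else touches only
 * mrf_low.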
2202 */
2203 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2204 int mrf_high;
2205 if (inst->dst.reg & BRW_MRF_COMPR4) {
2206 mrf_high = mrf_low + 4;
2207 } else if (dispatch_width == 16 &&
2208 (!inst->force_uncompressed && !inst->force_sechalf)) {
2209 mrf_high = mrf_low + 1;
2210 } else {
2211 mrf_high = mrf_low;
2212 }
2213
2214 /* Can't compute-to-MRF this GRF if someone else was going to
2215 * read it later.
2216 */
2217 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2218 continue;
2219
2220 /* Found a move of a GRF to a MRF. Let's see if we can go
2221 * rewrite the thing that made this GRF to write into the MRF.
2222 */
2223 fs_inst *scan_inst;
2224 for (scan_inst = (fs_inst *)inst->prev;
2225 scan_inst->prev != NULL;
2226 scan_inst = (fs_inst *)scan_inst->prev) {
2227 if (scan_inst->dst.file == GRF &&
2228 scan_inst->dst.reg == inst->src[0].reg) {
2229 /* Found the last instruction to write the reg we want to
2230 * turn into a compute-to-MRF.
2231 */
2232
2233 /* If this one instruction didn't populate all the
2234 * channels, bail. We might be able to rewrite everything
2235 * that writes that reg, but it would require smarter
2236 * tracking to delay the rewriting until complete success.
2237 */
2238 if (scan_inst->is_partial_write())
2239 break;
2240
2241 /* Things returning more than one register would need us to
2242 * understand coalescing out more than one MOV at a time.
2243 */
2244 if (scan_inst->regs_written > 1)
2245 break;
2246
2247 /* SEND instructions can't have MRF as a destination. */
2248 if (scan_inst->mlen)
2249 break;
2250
2251 if (brw->gen == 6) {
2252 /* gen6 math instructions must have the destination be
2253 * GRF, so no compute-to-MRF for them.
2254 */
2255 if (scan_inst->is_math()) {
2256 break;
2257 }
2258 }
2259
2260 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2261 /* Found the creator of our MRF's source value. */
2262 scan_inst->dst.file = MRF;
2263 scan_inst->dst.reg = inst->dst.reg;
2264 scan_inst->saturate |= inst->saturate;
2265 inst->remove();
2266 progress = true;
2267 }
2268 break;
2269 }
2270
2271 /* We don't handle control flow here. Most computation of
2272 * values that end up in MRFs happens shortly before the MRF
2273 * write anyway.
2274 */
2275 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2276 break;
2277
2278 /* You can't read from an MRF, so if someone else reads our
2279 * MRF's source GRF that we wanted to rewrite, that stops us.
2280 */
2281 bool interfered = false;
2282 for (int i = 0; i < 3; i++) {
2283 if (scan_inst->src[i].file == GRF &&
2284 scan_inst->src[i].reg == inst->src[0].reg &&
2285 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2286 interfered = true;
2287 }
2288 }
2289 if (interfered)
2290 break;
2291
2292 if (scan_inst->dst.file == MRF) {
2293 /* If somebody else writes our MRF here, we can't
2294 * compute-to-MRF before that.
2295 */
2296 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2297 int scan_mrf_high;
2298
2299 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2300 scan_mrf_high = scan_mrf_low + 4;
2301 } else if (dispatch_width == 16 &&
2302 (!scan_inst->force_uncompressed &&
2303 !scan_inst->force_sechalf)) {
2304 scan_mrf_high = scan_mrf_low + 1;
2305 } else {
2306 scan_mrf_high = scan_mrf_low;
2307 }
2308
2309 if (mrf_low == scan_mrf_low ||
2310 mrf_low == scan_mrf_high ||
2311 mrf_high == scan_mrf_low ||
2312 mrf_high == scan_mrf_high) {
2313 break;
2314 }
2315 }
2316
2317 if (scan_inst->mlen > 0) {
2318 /* Found a SEND instruction, which means that there are
2319 * live values in MRFs from base_mrf to base_mrf +
2320 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2321 * above it.
2322 */
2323 if (mrf_low >= scan_inst->base_mrf &&
2324 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2325 break;
2326 }
2327 if (mrf_high >= scan_inst->base_mrf &&
2328 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2329 break;
2330 }
2331 }
2332 }
2333 }
2334
2335 if (progress)
2336 live_intervals_valid = false;
2337
2338 return progress;
2339 }
2340
2341 /**
2342 * Walks through basic blocks, looking for repeated MRF writes and
2343 * removing the redundant later ones.
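 *
 * For example, in this made-up sequence (illustrative only), the second
 * MOV is removed because nothing has overwritten m3 or vgrf2 in between:
 *
 *    mov m3, vgrf2
 *    ...
 *    mov m3, vgrf2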
2344 */
2345 bool
2346 fs_visitor::remove_duplicate_mrf_writes()
2347 {
2348 fs_inst *last_mrf_move[16];
2349 bool progress = false;
2350
2351 /* We'd need to update the MRF tracking for compressed instructions, so bail for now. */
2352 if (dispatch_width == 16)
2353 return false;
2354
2355 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2356
2357 foreach_list_safe(node, &this->instructions) {
2358 fs_inst *inst = (fs_inst *)node;
2359
2360 if (inst->is_control_flow()) {
2361 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2362 }
2363
2364 if (inst->opcode == BRW_OPCODE_MOV &&
2365 inst->dst.file == MRF) {
2366 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2367 if (prev_inst && inst->equals(prev_inst)) {
2368 inst->remove();
2369 progress = true;
2370 continue;
2371 }
2372 }
2373
2374 /* Clear out the last-write records for MRFs that were overwritten. */
2375 if (inst->dst.file == MRF) {
2376 last_mrf_move[inst->dst.reg] = NULL;
2377 }
2378
2379 if (inst->mlen > 0) {
2380 /* Found a SEND instruction, which will include two or fewer
2381 * implied MRF writes. We could do better here.
2382 */
2383 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2384 last_mrf_move[inst->base_mrf + i] = NULL;
2385 }
2386 }
2387
2388 /* Clear out any MRF move records whose sources got overwritten. */
2389 if (inst->dst.file == GRF) {
2390 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2391 if (last_mrf_move[i] &&
2392 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2393 last_mrf_move[i] = NULL;
2394 }
2395 }
2396 }
2397
2398 if (inst->opcode == BRW_OPCODE_MOV &&
2399 inst->dst.file == MRF &&
2400 inst->src[0].file == GRF &&
2401 !inst->is_partial_write()) {
2402 last_mrf_move[inst->dst.reg] = inst;
2403 }
2404 }
2405
2406 if (progress)
2407 live_intervals_valid = false;
2408
2409 return progress;
2410 }
2411
2412 static void
2413 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2414 int first_grf, int grf_len)
2415 {
2416 bool inst_16wide = (dispatch_width > 8 &&
2417 !inst->force_uncompressed &&
2418 !inst->force_sechalf);
2419
2420 /* Clear the flag for registers that actually got read (as expected). */
2421 for (int i = 0; i < 3; i++) {
2422 int grf;
2423 if (inst->src[i].file == GRF) {
2424 grf = inst->src[i].reg;
2425 } else if (inst->src[i].file == HW_REG &&
2426 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2427 grf = inst->src[i].fixed_hw_reg.nr;
2428 } else {
2429 continue;
2430 }
2431
2432 if (grf >= first_grf &&
2433 grf < first_grf + grf_len) {
2434 deps[grf - first_grf] = false;
2435 if (inst_16wide)
2436 deps[grf - first_grf + 1] = false;
2437 }
2438 }
2439 }
2440
2441 /**
2442 * Implements this workaround for the original 965:
2443 *
2444 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2445 * check for post destination dependencies on this instruction, software
2446 * must ensure that there is no destination hazard for the case of ‘write
2447 * followed by a posted write’ shown in the following example.
2448 *
2449 * 1. mov r3 0
2450 * 2. send r3.xy <rest of send instruction>
2451 * 3. mov r2 r3
2452 *
2453 * Due to no post-destination dependency check on the ‘send’, the above
2454 * code sequence could have two instructions (1 and 2) in flight at the
2455 * same time that both consider ‘r3’ as the target of their final writes."
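 *
 * The pass below resolves this by inserting a dependency-resolving MOV on
 * the hazardous register between the two writers, e.g. (the self-MOV form
 * is purely illustrative):
 *
 *    1.  mov r3 0
 *    1a. mov r3 r3
 *    2.  send r3.xy <rest of send instruction>
 *    3.  mov r2 r3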
2456 */
2457 void
2458 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2459 {
2460 int reg_size = dispatch_width / 8;
2461 int write_len = inst->regs_written * reg_size;
2462 int first_write_grf = inst->dst.reg;
2463 bool needs_dep[BRW_MAX_MRF];
2464 assert(write_len < (int)sizeof(needs_dep) - 1);
2465
2466 memset(needs_dep, false, sizeof(needs_dep));
2467 memset(needs_dep, true, write_len);
2468
2469 clear_deps_for_inst_src(inst, dispatch_width,
2470 needs_dep, first_write_grf, write_len);
2471
2472 /* Walk backwards looking for writes to registers we're writing which
2473 * aren't read since being written. If we hit the start of the program,
2474 * we assume that there are no outstanding dependencies on entry to the
2475 * program.
2476 */
2477 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2478 scan_inst != NULL;
2479 scan_inst = (fs_inst *)scan_inst->prev) {
2480
2481 /* If we hit control flow, assume that there *are* outstanding
2482 * dependencies, and force their cleanup before our instruction.
2483 */
2484 if (scan_inst->is_control_flow()) {
2485 for (int i = 0; i < write_len; i++) {
2486 if (needs_dep[i]) {
2487 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2488 }
2489 }
2490 return;
2491 }
2492
2493 bool scan_inst_16wide = (dispatch_width > 8 &&
2494 !scan_inst->force_uncompressed &&
2495 !scan_inst->force_sechalf);
2496
2497 /* We insert our reads as late as possible on the assumption that any
2498 * non-MOV instruction that might have left us an outstanding
2499 * dependency has more latency than a MOV.
2500 */
2501 if (scan_inst->dst.file == GRF) {
2502 for (int i = 0; i < scan_inst->regs_written; i++) {
2503 int reg = scan_inst->dst.reg + i * reg_size;
2504
2505 if (reg >= first_write_grf &&
2506 reg < first_write_grf + write_len &&
2507 needs_dep[reg - first_write_grf]) {
2508 inst->insert_before(DEP_RESOLVE_MOV(reg));
2509 needs_dep[reg - first_write_grf] = false;
2510 if (scan_inst_16wide)
2511 needs_dep[reg - first_write_grf + 1] = false;
2512 }
2513 }
2514 }
2515
2516 /* Clear the flag for registers that actually got read (as expected). */
2517 clear_deps_for_inst_src(scan_inst, dispatch_width,
2518 needs_dep, first_write_grf, write_len);
2519
2520 /* Continue the loop only if we haven't resolved all the dependencies */
2521 int i;
2522 for (i = 0; i < write_len; i++) {
2523 if (needs_dep[i])
2524 break;
2525 }
2526 if (i == write_len)
2527 return;
2528 }
2529 }
2530
2531 /**
2532 * Implements this workaround for the original 965:
2533 *
2534 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2535 * used as a destination register until after it has been sourced by an
2536 * instruction with a different destination register."
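 *
 * E.g. (illustrative):
 *
 *    1. send r3.xy <rest of send instruction>
 *    2. mov r3 r2
 *
 * The pass below inserts a MOV sourcing r3 between 1 and 2, so that r3
 * has been read before it is reused as a destination.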
2537 */
2538 void
2539 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2540 {
2541 int write_len = inst->regs_written * dispatch_width / 8;
2542 int first_write_grf = inst->dst.reg;
2543 bool needs_dep[BRW_MAX_MRF];
2544 assert(write_len < (int)sizeof(needs_dep) - 1);
2545
2546 memset(needs_dep, false, sizeof(needs_dep));
2547 memset(needs_dep, true, write_len);
2548 /* Walk forwards looking for writes to registers we're writing which aren't
2549 * read before being written.
2550 */
2551 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2552 !scan_inst->is_tail_sentinel();
2553 scan_inst = (fs_inst *)scan_inst->next) {
2554 /* If we hit control flow, force resolve all remaining dependencies. */
2555 if (scan_inst->is_control_flow()) {
2556 for (int i = 0; i < write_len; i++) {
2557 if (needs_dep[i])
2558 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2559 }
2560 return;
2561 }
2562
2563 /* Clear the flag for registers that actually got read (as expected). */
2564 clear_deps_for_inst_src(scan_inst, dispatch_width,
2565 needs_dep, first_write_grf, write_len);
2566
2567 /* We insert our reads as late as possible since they're reading the
2568 * result of a SEND, which has massive latency.
2569 */
2570 if (scan_inst->dst.file == GRF &&
2571 scan_inst->dst.reg >= first_write_grf &&
2572 scan_inst->dst.reg < first_write_grf + write_len &&
2573 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2574 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2575 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2576 }
2577
2578 /* Continue the loop only if we haven't resolved all the dependencies */
2579 int i;
2580 for (i = 0; i < write_len; i++) {
2581 if (needs_dep[i])
2582 break;
2583 }
2584 if (i == write_len)
2585 return;
2586 }
2587
2588 /* If we hit the end of the program, resolve all remaining dependencies out
2589 * of paranoia.
2590 */
2591 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2592 assert(last_inst->eot);
2593 for (int i = 0; i < write_len; i++) {
2594 if (needs_dep[i])
2595 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2596 }
2597 }
2598
2599 void
2600 fs_visitor::insert_gen4_send_dependency_workarounds()
2601 {
2602 if (brw->gen != 4 || brw->is_g4x)
2603 return;
2604
2605 /* Note that we're done with register allocation, so GRF fs_regs always
2606 * have a .reg_offset of 0.
2607 */
2608
2609 foreach_list_safe(node, &this->instructions) {
2610 fs_inst *inst = (fs_inst *)node;
2611
2612 if (inst->mlen != 0 && inst->dst.file == GRF) {
2613 insert_gen4_pre_send_dependency_workarounds(inst);
2614 insert_gen4_post_send_dependency_workarounds(inst);
2615 }
2616 }
2617 }
2618
2619 /**
2620 * Turns the generic expression-style uniform pull constant load instruction
2621 * into a hardware-specific series of instructions for loading a pull
2622 * constant.
2623 *
2624 * The expression style allows the CSE pass before this to optimize out
2625 * repeated loads from the same offset, and gives the pre-register-allocation
2626 * scheduling full flexibility, while the conversion to native instructions
2627 * allows the post-register-allocation scheduler the best information
2628 * possible.
2629 *
2630 * Note that execution masking for setting up pull constant loads is special:
2631 * the channels that need to be written are unrelated to the current execution
2632 * mask, since a later instruction will use one of the result channels as a
2633 * source operand for all 8 or 16 of its channels.
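 *
 * On gen7, for example, a generic load like (illustrative pseudo-IR, with
 * src0 assumed to be the surface index):
 *
 *    vgrf1 = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD surf, byte_offset
 *
 * is lowered to:
 *
 *    payload = FS_OPCODE_SET_SIMD4X2_OFFSET byte_offset/4
 *    vgrf1 = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 surf, payload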
2634 */
2635 void
2636 fs_visitor::lower_uniform_pull_constant_loads()
2637 {
2638 foreach_list(node, &this->instructions) {
2639 fs_inst *inst = (fs_inst *)node;
2640
2641 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2642 continue;
2643
2644 if (brw->gen >= 7) {
2645 /* The offset arg before was a vec4-aligned byte offset. We need to
2646 * turn it into a dword offset.
2647 */
2648 fs_reg const_offset_reg = inst->src[1];
2649 assert(const_offset_reg.file == IMM &&
2650 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2651 const_offset_reg.imm.u /= 4;
2652 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2653
2654 /* This is actually going to be a MOV, but since only the first dword
2655 * is accessed, we have a special opcode to do just that one. Note
2656 * that this needs to be an operation that will be considered a def
2657 * by live variable analysis, or register allocation will explode.
2658 */
2659 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2660 payload, const_offset_reg);
2661 setup->force_writemask_all = true;
2662
2663 setup->ir = inst->ir;
2664 setup->annotation = inst->annotation;
2665 inst->insert_before(setup);
2666
2667 /* Similarly, this will only populate the first 4 channels of the
2668 * result register (since we only use smear values from 0-3), but we
2669 * don't tell the optimizer.
2670 */
2671 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2672 inst->src[1] = payload;
2673
2674 this->live_intervals_valid = false;
2675 } else {
2676 /* Before register allocation, we didn't tell the scheduler about the
2677 * MRF we use. We know it's safe to use this MRF because nothing
2678 * else does except for register spill/unspill, which generates and
2679 * uses its MRF within a single IR instruction.
2680 */
2681 inst->base_mrf = 14;
2682 inst->mlen = 1;
2683 }
2684 }
2685 }
2686
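/**
 * Prints a human-readable form of a single FS IR instruction for debugging.
 *
 * The output looks roughly like the following made-up line (the exact
 * fields depend on the instruction):
 *
 *    (+f0.0) add.sat vgrf7, vgrf5, u2, (null) 2ndhalf
 */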
2687 void
2688 fs_visitor::dump_instruction(backend_instruction *be_inst)
2689 {
2690 fs_inst *inst = (fs_inst *)be_inst;
2691
2692 if (inst->predicate) {
2693 printf("(%cf0.%d) ",
2694 inst->predicate_inverse ? '-' : '+',
2695 inst->flag_subreg);
2696 }
2697
2698 printf("%s", brw_instruction_name(inst->opcode));
2699 if (inst->saturate)
2700 printf(".sat");
2701 if (inst->conditional_mod) {
2702 printf(".cmod");
2703 if (!inst->predicate &&
2704 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2705 inst->opcode != BRW_OPCODE_IF &&
2706 inst->opcode != BRW_OPCODE_WHILE))) {
2707 printf(".f0.%d", inst->flag_subreg);
2708 }
2709 }
2710 printf(" ");
2711
2713 switch (inst->dst.file) {
2714 case GRF:
2715 printf("vgrf%d", inst->dst.reg);
2716 if (inst->dst.reg_offset)
2717 printf("+%d", inst->dst.reg_offset);
2718 break;
2719 case MRF:
2720 printf("m%d", inst->dst.reg);
2721 break;
2722 case BAD_FILE:
2723 printf("(null)");
2724 break;
2725 case UNIFORM:
2726 printf("***u%d***", inst->dst.reg);
2727 break;
2728 case ARF:
2729 if (inst->dst.reg == BRW_ARF_NULL)
2730 printf("(null)");
2731 else
2732 printf("arf%d", inst->dst.reg);
2733 break;
2734 default:
2735 printf("???");
2736 break;
2737 }
2738 printf(", ");
2739
2740 for (int i = 0; i < 3; i++) {
2741 if (inst->src[i].negate)
2742 printf("-");
2743 if (inst->src[i].abs)
2744 printf("|");
2745 switch (inst->src[i].file) {
2746 case GRF:
2747 printf("vgrf%d", inst->src[i].reg);
2748 if (inst->src[i].reg_offset)
2749 printf("+%d", inst->src[i].reg_offset);
2750 break;
2751 case MRF:
2752 printf("***m%d***", inst->src[i].reg);
2753 break;
2754 case UNIFORM:
2755 printf("u%d", inst->src[i].reg);
2756 if (inst->src[i].reg_offset)
2757 printf(".%d", inst->src[i].reg_offset);
2758 break;
2759 case BAD_FILE:
2760 printf("(null)");
2761 break;
2762 case IMM:
2763 switch (inst->src[i].type) {
2764 case BRW_REGISTER_TYPE_F:
2765 printf("%ff", inst->src[i].imm.f);
2766 break;
2767 case BRW_REGISTER_TYPE_D:
2768 printf("%dd", inst->src[i].imm.i);
2769 break;
2770 case BRW_REGISTER_TYPE_UD:
2771 printf("%uu", inst->src[i].imm.u);
2772 break;
2773 default:
2774 printf("???");
2775 break;
2776 }
2777 break;
2778 default:
2779 printf("???");
2780 break;
2781 }
2782 if (inst->src[i].abs)
2783 printf("|");
2784
2785 if (i < 2)
2786 printf(", ");
2787 }
2788
2789 printf(" ");
2790
2791 if (inst->force_uncompressed)
2792 printf("1sthalf ");
2793
2794 if (inst->force_sechalf)
2795 printf("2ndhalf ");
2796
2797 printf("\n");
2798 }
2799
2800 /**
2801 * Possibly returns an instruction that set up @param reg.
2802 *
2803 * Sometimes we want to take the result of some expression/variable
2804 * dereference tree and rewrite the instruction generating the result
2805 * of the tree. When processing the tree, we know that the
2806 * instructions generated are all writing temporaries that are dead
2807 * outside of this tree. So, if we have some instructions that write
2808 * a temporary, we're free to point that temp write somewhere else.
2809 *
2810 * Note that this doesn't guarantee that the returned instruction wrote
2811 * only reg -- it might be the size=4 destination of a texture instruction.
2812 */
2813 fs_inst *
2814 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2815 fs_inst *end,
2816 fs_reg reg)
2817 {
2818 if (end == start ||
2819 end->is_partial_write() ||
2820 reg.reladdr ||
2821 !reg.equals(end->dst)) {
2822 return NULL;
2823 } else {
2824 return end;
2825 }
2826 }
2827
2828 void
2829 fs_visitor::setup_payload_gen6()
2830 {
2831 bool uses_depth =
2832 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2833 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2834
2835 assert(brw->gen >= 6);
2836
2837 /* R0-1: masks, pixel X/Y coordinates. */
2838 c->nr_payload_regs = 2;
2839 /* R2: only for 32-pixel dispatch. */
2840
2841 /* R3-26: barycentric interpolation coordinates. These appear in the
2842 * same order that they appear in the brw_wm_barycentric_interp_mode
2843 * enum. Each set of coordinates occupies 2 registers if dispatch width
2844 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2845 * appear if they were enabled using the "Barycentric Interpolation
2846 * Mode" bits in WM_STATE.
2847 */
2848 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2849 if (barycentric_interp_modes & (1 << i)) {
2850 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2851 c->nr_payload_regs += 2;
2852 if (dispatch_width == 16) {
2853 c->nr_payload_regs += 2;
2854 }
2855 }
2856 }
2857
2858 /* R27: interpolated depth if uses source depth */
2859 if (uses_depth) {
2860 c->source_depth_reg = c->nr_payload_regs;
2861 c->nr_payload_regs++;
2862 if (dispatch_width == 16) {
2863 /* R28: interpolated depth if not 8-wide. */
2864 c->nr_payload_regs++;
2865 }
2866 }
2867 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2868 if (uses_depth) {
2869 c->source_w_reg = c->nr_payload_regs;
2870 c->nr_payload_regs++;
2871 if (dispatch_width == 16) {
2872 /* R30: interpolated W if not 8-wide. */
2873 c->nr_payload_regs++;
2874 }
2875 }
2876 /* R31: MSAA position offsets. */
2877 /* R32-: bary for 32-pixel. */
2878 /* R58-59: interp W for 32-pixel. */
2879
2880 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2881 c->source_depth_to_render_target = true;
2882 }
2883 }
2884
2885 bool
2886 fs_visitor::run()
2887 {
2888 sanity_param_count = fp->Base.Parameters->NumParameters;
2889 uint32_t orig_nr_params = c->prog_data.nr_params;
2890
2891 if (brw->gen >= 6)
2892 setup_payload_gen6();
2893 else
2894 setup_payload_gen4();
2895
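/* Debug toggle: emit a trivial dummy shader instead of the real program. */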
2896 if (0) {
2897 emit_dummy_fs();
2898 } else {
2899 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2900 emit_shader_time_begin();
2901
2902 calculate_urb_setup();
2903 if (brw->gen < 6)
2904 emit_interpolation_setup_gen4();
2905 else
2906 emit_interpolation_setup_gen6();
2907
2908 /* We handle discards by keeping track of the still-live pixels in f0.1.
2909 * Initialize it with the dispatched pixels.
2910 */
2911 if (fp->UsesKill) {
2912 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2913 discard_init->flag_subreg = 1;
2914 }
2915
2916 /* Generate FS IR for main(). (The visitor only descends into
2917 * functions called "main".)
2918 */
2919 if (shader) {
2920 foreach_list(node, &*shader->ir) {
2921 ir_instruction *ir = (ir_instruction *)node;
2922 base_ir = ir;
2923 this->result = reg_undef;
2924 ir->accept(this);
2925 }
2926 } else {
2927 emit_fragment_program_code();
2928 }
2929 base_ir = NULL;
2930 if (failed)
2931 return false;
2932
2933 emit(FS_OPCODE_PLACEHOLDER_HALT);
2934
2935 emit_fb_writes();
2936
2937 split_virtual_grfs();
2938
2939 move_uniform_array_access_to_pull_constants();
2940 setup_pull_constants();
2941
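/* Iterate the cheap optimization passes to a fixed point: each pass can
* expose new opportunities for the others.
*/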
2942 bool progress;
2943 do {
2944 progress = false;
2945
2946 compact_virtual_grfs();
2947
2948 progress = remove_duplicate_mrf_writes() || progress;
2949
2950 progress = opt_algebraic() || progress;
2951 progress = opt_cse() || progress;
2952 progress = opt_copy_propagate() || progress;
2953 progress = dead_code_eliminate() || progress;
2954 progress = dead_code_eliminate_local() || progress;
2955 progress = register_coalesce() || progress;
2956 progress = register_coalesce_2() || progress;
2957 progress = compute_to_mrf() || progress;
2958 } while (progress);
2959
2960 remove_dead_constants();
2961
2962 schedule_instructions(false);
2963
2964 lower_uniform_pull_constant_loads();
2965
2966 assign_curb_setup();
2967 assign_urb_setup();
2968
2969 if (0) {
2970 /* Debug of register spilling: Go spill everything. */
2971 for (int i = 0; i < virtual_grf_count; i++) {
2972 spill_reg(i);
2973 }
2974 }
2975
2976 if (0)
2977 assign_regs_trivial();
2978 else {
2979 while (!assign_regs()) {
2980 if (failed)
2981 break;
2982 }
2983 }
2984 }
2985 assert(force_uncompressed_stack == 0);
2986 assert(force_sechalf_stack == 0);
2987
2988 /* This must come after all optimization and register allocation, since
2989 * it inserts dead code that happens to have side effects, and it does
2990 * so based on the actual physical registers in use.
2991 */
2992 insert_gen4_send_dependency_workarounds();
2993
2994 if (failed)
2995 return false;
2996
2997 schedule_instructions(true);
2998
2999 if (dispatch_width == 8) {
3000 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3001 } else {
3002 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3003
3004 /* Make sure we didn't try to sneak in an extra uniform */
3005 assert(orig_nr_params == c->prog_data.nr_params);
3006 (void) orig_nr_params;
3007 }
3008
3009 /* If any state parameters were appended, then ParameterValues could have
3010 * been realloced, in which case the driver uniform storage set up by
3011 * _mesa_associate_uniform_storage() would point to freed memory. Make
3012 * sure that didn't happen.
3013 */
3014 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3015
3016 return !failed;
3017 }
3018
3019 const unsigned *
3020 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3021 struct gl_fragment_program *fp,
3022 struct gl_shader_program *prog,
3023 unsigned *final_assembly_size)
3024 {
3025 bool start_busy = false;
3026 float start_time = 0;
3027
3028 if (unlikely(brw->perf_debug)) {
3029 start_busy = (brw->batch.last_bo &&
3030 drm_intel_bo_busy(brw->batch.last_bo));
3031 start_time = get_time();
3032 }
3033
3034 struct brw_shader *shader = NULL;
3035 if (prog)
3036 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3037
3038 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3039 if (prog) {
3040 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3041 _mesa_print_ir(shader->ir, NULL);
3042 printf("\n\n");
3043 } else {
3044 printf("ARB_fragment_program %d ir for native fragment shader\n",
3045 fp->Base.Id);
3046 _mesa_print_program(&fp->Base);
3047 }
3048 }
3049
3050 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3051 */
3052 fs_visitor v(brw, c, prog, fp, 8);
3053 if (!v.run()) {
3054 if (prog) {
3055 prog->LinkStatus = false;
3056 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3057 }
3058
3059 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3060 v.fail_msg);
3061
3062 return NULL;
3063 }
3064
3065 exec_list *simd16_instructions = NULL;
3066 fs_visitor v2(brw, c, prog, fp, 16);
3067 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3068 if (c->prog_data.nr_pull_params == 0) {
3069 /* Try a 16-wide compile */
3070 v2.import_uniforms(&v);
3071 if (!v2.run()) {
3072 perf_debug("16-wide shader failed to compile, falling back to "
3073 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3074 } else {
3075 simd16_instructions = &v2.instructions;
3076 }
3077 } else {
3078 perf_debug("Skipping 16-wide due to pull parameters.\n");
3079 }
3080 }
3081
3082 c->prog_data.dispatch_width = 8;
3083
3084 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3085 const unsigned *generated = g.generate_assembly(&v.instructions,
3086 simd16_instructions,
3087 final_assembly_size);
3088
3089 if (unlikely(brw->perf_debug) && shader) {
3090 if (shader->compiled_once)
3091 brw_wm_debug_recompile(brw, prog, &c->key);
3092 shader->compiled_once = true;
3093
3094 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3095 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3096 (get_time() - start_time) * 1000);
3097 }
3098 }
3099
3100 return generated;
3101 }
3102
3103 bool
3104 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3105 {
3106 struct brw_context *brw = brw_context(ctx);
3107 struct brw_wm_prog_key key;
3108
3109 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3110 return true;
3111
3112 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3113 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3114 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3115 bool program_uses_dfdy = fp->UsesDFdy;
3116
3117 memset(&key, 0, sizeof(key));
3118
3119 if (brw->gen < 6) {
3120 if (fp->UsesKill)
3121 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3122
3123 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3124 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3125
3126 /* Just assume depth testing. */
3127 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3128 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3129 }
3130
3131 if (brw->gen < 6)
3132 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3133
3134 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3135 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3136 continue;
3137
3138 if (brw->gen < 6) {
3139 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3140 key.input_slots_valid |= BITFIELD64_BIT(i);
3141 }
3142 }
3143
3144 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3145
3146 for (int i = 0; i < MAX_SAMPLERS; i++) {
3147 if (fp->Base.ShadowSamplers & (1 << i)) {
3148 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3149 key.tex.swizzles[i] =
3150 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3151 } else {
3152 /* Color sampler: assume no swizzling. */
3153 key.tex.swizzles[i] = SWIZZLE_XYZW;
3154 }
3155 }
3156
3157 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3158 key.drawable_height = ctx->DrawBuffer->Height;
3159 }
3160
3161 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3162 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3163 }
3164
3165 key.nr_color_regions = 1;
3166
3167 key.program_string_id = bfp->id;
3168
3169 uint32_t old_prog_offset = brw->wm.prog_offset;
3170 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3171
3172 bool success = do_wm_prog(brw, prog, bfp, &key);
3173
3174 brw->wm.prog_offset = old_prog_offset;
3175 brw->wm.prog_data = old_prog_data;
3176
3177 return success;
3178 }