i965: compute DDX in a subspan based only on top row
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "main/uniforms.h"
50 #include "glsl/glsl_types.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
66 }
67
68 fs_inst::fs_inst()
69 {
70 init();
71 }
72
73 fs_inst::fs_inst(enum opcode opcode)
74 {
75 init();
76 this->opcode = opcode;
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
80 {
81 init();
82 this->opcode = opcode;
83 this->dst = dst;
84
85 if (dst.file == GRF)
86 assert(dst.reg_offset >= 0);
87 }
88
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
90 {
91 init();
92 this->opcode = opcode;
93 this->dst = dst;
94 this->src[0] = src0;
95
96 if (dst.file == GRF)
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
103 {
104 init();
105 this->opcode = opcode;
106 this->dst = dst;
107 this->src[0] = src0;
108 this->src[1] = src1;
109
110 if (dst.file == GRF)
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
116 }
117
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
120 {
121 init();
122 this->opcode = opcode;
123 this->dst = dst;
124 this->src[0] = src0;
125 this->src[1] = src1;
126 this->src[2] = src2;
127
128 if (dst.file == GRF)
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
136 }
137
138 #define ALU1(op) \
139 fs_inst * \
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
141 { \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
143 }
144
145 #define ALU2(op) \
146 fs_inst * \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
148 { \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
150 }
151
152 #define ALU3(op) \
153 fs_inst * \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
155 { \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
157 }
158
159 ALU1(NOT)
160 ALU1(MOV)
161 ALU1(FRC)
162 ALU1(RNDD)
163 ALU1(RNDE)
164 ALU1(RNDZ)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183
184 /** Gen4 predicated IF. */
185 fs_inst *
186 fs_visitor::IF(uint32_t predicate)
187 {
188 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
189 inst->predicate = predicate;
190 return inst;
191 }
192
193 /** Gen6+ IF with embedded comparison. */
194 fs_inst *
195 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
196 {
197 assert(brw->gen >= 6);
198 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
199 reg_null_d, src0, src1);
200 inst->conditional_mod = condition;
201 return inst;
202 }
203
204 /**
205 * CMP: Sets the low bit of the destination channels with the result
206 * of the comparison, while the upper bits are undefined, and updates
207 * the flag register with the packed 16 bits of the result.
208 */
209 fs_inst *
210 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
211 {
212 fs_inst *inst;
213
214 /* Take the instruction:
215 *
216 * CMP null<d> src0<f> src1<f>
217 *
218 * Original gen4 does type conversion to the destination type before
219 * comparison, producing garbage results for floating point comparisons.
220 * gen5 does the comparison on the execution type (resolved source types),
221 * so dst type doesn't matter. gen6 does comparison and then uses the
222 * result as if it was the dst type with no conversion, which happens to
223 * mostly work out for float-interpreted-as-int since our comparisons are
224 * for >0, =0, <0.
225 */
226 if (brw->gen == 4) {
227 dst.type = src0.type;
228 if (dst.file == HW_REG)
229 dst.fixed_hw_reg.type = dst.type;
230 }
231
232 resolve_ud_negate(&src0);
233 resolve_ud_negate(&src1);
234
235 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
236 inst->conditional_mod = condition;
237
238 return inst;
239 }
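/* A minimal usage sketch for the CMP helper above (operand names are
 * hypothetical): the conditional mod lands in the flag register, so a later
 * IF or predicated instruction can consume it, e.g.
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 *    emit(IF(BRW_PREDICATE_NORMAL));
 */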
240
241 exec_list
242 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
243 fs_reg varying_offset,
244 uint32_t const_offset)
245 {
246 exec_list instructions;
247 fs_inst *inst;
248
249 /* We have our constant surface use a pitch of 4 bytes, so our index can
250 * be any component of a vector, and then we load 4 contiguous
251 * components starting from that.
252 *
253 * We break down the const_offset to a portion added to the variable
254 * offset and a portion done using reg_offset, which means that if you
255 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
256 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
257 * CSE can later notice that those loads are all the same and eliminate
258 * the redundant ones.
259 */
260 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
261 instructions.push_tail(ADD(vec4_offset,
262 varying_offset, const_offset & ~3));
263
264 int scale = 1;
265 if (brw->gen == 4 && dispatch_width == 8) {
266 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
267 * u, v, r) as parameters, or we can just use the SIMD16 message
268 * consisting of (header, u). We choose the second, at the cost of a
269 * longer return length.
270 */
271 scale = 2;
272 }
273
274 enum opcode op;
275 if (brw->gen >= 7)
276 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
277 else
278 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
279 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
280 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
281 inst->regs_written = 4 * scale;
282 instructions.push_tail(inst);
283
284 if (brw->gen < 7) {
285 inst->base_mrf = 13;
286 inst->header_present = true;
287 if (brw->gen == 4)
288 inst->mlen = 3;
289 else
290 inst->mlen = 1 + dispatch_width / 8;
291 }
292
293 vec4_result.reg_offset += (const_offset & 3) * scale;
294 instructions.push_tail(MOV(dst, vec4_result));
295
296 return instructions;
297 }
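/* Worked example of the offset split above (hypothetical numbers): with
 * const_offset = 7 and scale = 1, the variable part becomes
 * varying_offset + (7 & ~3) = varying_offset + 4, a full vec4 is loaded from
 * that address, and the final MOV picks component (7 & 3) = 3 out of the
 * returned vec4 via reg_offset.
 */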
298
299 /**
300 * A helper for MOV generation for fixing up broken hardware SEND dependency
301 * handling.
302 */
303 fs_inst *
304 fs_visitor::DEP_RESOLVE_MOV(int grf)
305 {
306 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
307
308 inst->ir = NULL;
309 inst->annotation = "send dependency resolve";
310
311 /* The caller always wants uncompressed to emit the minimal extra
312 * dependencies, and to avoid having to deal with aligning its regs to 2.
313 */
314 inst->force_uncompressed = true;
315
316 return inst;
317 }
318
319 bool
320 fs_inst::equals(fs_inst *inst)
321 {
322 return (opcode == inst->opcode &&
323 dst.equals(inst->dst) &&
324 src[0].equals(inst->src[0]) &&
325 src[1].equals(inst->src[1]) &&
326 src[2].equals(inst->src[2]) &&
327 saturate == inst->saturate &&
328 predicate == inst->predicate &&
329 conditional_mod == inst->conditional_mod &&
330 mlen == inst->mlen &&
331 base_mrf == inst->base_mrf &&
332 sampler == inst->sampler &&
333 target == inst->target &&
334 eot == inst->eot &&
335 header_present == inst->header_present &&
336 shadow_compare == inst->shadow_compare &&
337 offset == inst->offset);
338 }
339
340 bool
341 fs_inst::overwrites_reg(const fs_reg &reg)
342 {
343 return (reg.file == dst.file &&
344 reg.reg == dst.reg &&
345 reg.reg_offset >= dst.reg_offset &&
346 reg.reg_offset < dst.reg_offset + regs_written);
347 }
348
349 bool
350 fs_inst::is_send_from_grf()
351 {
352 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
353 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
354 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
355 src[1].file == GRF));
356 }
357
358 bool
359 fs_visitor::can_do_source_mods(fs_inst *inst)
360 {
361 if (brw->gen == 6 && inst->is_math())
362 return false;
363
364 if (inst->is_send_from_grf())
365 return false;
366
367 return true;
368 }
369
370 void
371 fs_reg::init()
372 {
373 memset(this, 0, sizeof(*this));
374 this->smear = -1;
375 }
376
377 /** Generic unset register constructor. */
378 fs_reg::fs_reg()
379 {
380 init();
381 this->file = BAD_FILE;
382 }
383
384 /** Immediate value constructor. */
385 fs_reg::fs_reg(float f)
386 {
387 init();
388 this->file = IMM;
389 this->type = BRW_REGISTER_TYPE_F;
390 this->imm.f = f;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(int32_t i)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_D;
399 this->imm.i = i;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(uint32_t u)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_UD;
408 this->imm.u = u;
409 }
410
411 /** Fixed brw_reg Immediate value constructor. */
412 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
413 {
414 init();
415 this->file = HW_REG;
416 this->fixed_hw_reg = fixed_hw_reg;
417 this->type = fixed_hw_reg.type;
418 }
419
420 bool
421 fs_reg::equals(const fs_reg &r) const
422 {
423 return (file == r.file &&
424 reg == r.reg &&
425 reg_offset == r.reg_offset &&
426 type == r.type &&
427 negate == r.negate &&
428 abs == r.abs &&
429 !reladdr && !r.reladdr &&
430 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
431 sizeof(fixed_hw_reg)) == 0 &&
432 smear == r.smear &&
433 imm.u == r.imm.u);
434 }
435
436 bool
437 fs_reg::is_zero() const
438 {
439 if (file != IMM)
440 return false;
441
442 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
443 }
444
445 bool
446 fs_reg::is_one() const
447 {
448 if (file != IMM)
449 return false;
450
451 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
452 }
453
454 bool
455 fs_reg::is_valid_3src() const
456 {
457 return file == GRF || file == UNIFORM;
458 }
459
460 int
461 fs_visitor::type_size(const struct glsl_type *type)
462 {
463 unsigned int size, i;
464
465 switch (type->base_type) {
466 case GLSL_TYPE_UINT:
467 case GLSL_TYPE_INT:
468 case GLSL_TYPE_FLOAT:
469 case GLSL_TYPE_BOOL:
470 return type->components();
471 case GLSL_TYPE_ARRAY:
472 return type_size(type->fields.array) * type->length;
473 case GLSL_TYPE_STRUCT:
474 size = 0;
475 for (i = 0; i < type->length; i++) {
476 size += type_size(type->fields.structure[i].type);
477 }
478 return size;
479 case GLSL_TYPE_SAMPLER:
480 /* Samplers take up no register space, since they're baked in at
481 * link time.
482 */
483 return 0;
484 case GLSL_TYPE_VOID:
485 case GLSL_TYPE_ERROR:
486 case GLSL_TYPE_INTERFACE:
487 assert(!"not reached");
488 break;
489 }
490
491 return 0;
492 }
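/* Illustrative slot counts computed by type_size() (hypothetical GLSL types):
 * a float or bool is 1, a vec4 is 4, a mat3 is 9 (its components), a
 * "vec4 a[20]" array is 4 * 20 = 80, and a struct is the sum of its fields.
 */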
493
494 fs_reg
495 fs_visitor::get_timestamp()
496 {
497 assert(brw->gen >= 7);
498
499 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
500 BRW_ARF_TIMESTAMP,
501 0),
502 BRW_REGISTER_TYPE_UD));
503
504 fs_reg dst = fs_reg(this, glsl_type::uint_type);
505
506 fs_inst *mov = emit(MOV(dst, ts));
507 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
508 * even if it's not enabled in the dispatch.
509 */
510 mov->force_writemask_all = true;
511 mov->force_uncompressed = true;
512
513 /* The caller wants the low 32 bits of the timestamp. Since it's running
514 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
515 * which is plenty of time for our purposes. It is identical across the
516 * EUs, but since it's tracking GPU core speed it will increment at a
517 * varying rate as render P-states change.
518 *
519 * The caller could also check if render P-states have changed (or anything
520 * else that might disrupt timing) by setting smear to 2 and checking if
521 * that field is != 0.
522 */
523 dst.smear = 0;
524
525 return dst;
526 }
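/* Back-of-the-envelope check for the comment above: the low 32 bits of the
 * timestamp wrap after 2^32 ticks, and at a clock of roughly 1.2 GHz that is
 * about 2^32 / 1.2e9 ~= 3.6 seconds, hence the "~3 seconds" figure.
 */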
527
528 void
529 fs_visitor::emit_shader_time_begin()
530 {
531 current_annotation = "shader time start";
532 shader_start_time = get_timestamp();
533 }
534
535 void
536 fs_visitor::emit_shader_time_end()
537 {
538 current_annotation = "shader time end";
539
540 enum shader_time_shader_type type, written_type, reset_type;
541 if (dispatch_width == 8) {
542 type = ST_FS8;
543 written_type = ST_FS8_WRITTEN;
544 reset_type = ST_FS8_RESET;
545 } else {
546 assert(dispatch_width == 16);
547 type = ST_FS16;
548 written_type = ST_FS16_WRITTEN;
549 reset_type = ST_FS16_RESET;
550 }
551
552 fs_reg shader_end_time = get_timestamp();
553
554 /* Check that there weren't any timestamp reset events (assuming these
555 * were the only two timestamp reads that happened).
556 */
557 fs_reg reset = shader_end_time;
558 reset.smear = 2;
559 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
560 test->conditional_mod = BRW_CONDITIONAL_Z;
561 emit(IF(BRW_PREDICATE_NORMAL));
562
563 push_force_uncompressed();
564 fs_reg start = shader_start_time;
565 start.negate = true;
566 fs_reg diff = fs_reg(this, glsl_type::uint_type);
567 emit(ADD(diff, start, shader_end_time));
568
569 /* If there were no instructions between the two timestamp gets, the diff
570 * is 2 cycles. Remove that overhead, so it can be ignored when
571 * trying to determine the time taken for single instructions.
572 */
573 emit(ADD(diff, diff, fs_reg(-2u)));
574
575 emit_shader_time_write(type, diff);
576 emit_shader_time_write(written_type, fs_reg(1u));
577 emit(BRW_OPCODE_ELSE);
578 emit_shader_time_write(reset_type, fs_reg(1u));
579 emit(BRW_OPCODE_ENDIF);
580
581 pop_force_uncompressed();
582 }
583
584 void
585 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
586 fs_reg value)
587 {
588 int shader_time_index =
589 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
590 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
591
592 fs_reg payload;
593 if (dispatch_width == 8)
594 payload = fs_reg(this, glsl_type::uvec2_type);
595 else
596 payload = fs_reg(this, glsl_type::uint_type);
597
598 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
599 fs_reg(), payload, offset, value));
600 }
601
602 void
603 fs_visitor::fail(const char *format, ...)
604 {
605 va_list va;
606 char *msg;
607
608 if (failed)
609 return;
610
611 failed = true;
612
613 va_start(va, format);
614 msg = ralloc_vasprintf(mem_ctx, format, va);
615 va_end(va);
616 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
617
618 this->fail_msg = msg;
619
620 if (INTEL_DEBUG & DEBUG_WM) {
621 fprintf(stderr, "%s", msg);
622 }
623 }
624
625 fs_inst *
626 fs_visitor::emit(enum opcode opcode)
627 {
628 return emit(fs_inst(opcode));
629 }
630
631 fs_inst *
632 fs_visitor::emit(enum opcode opcode, fs_reg dst)
633 {
634 return emit(fs_inst(opcode, dst));
635 }
636
637 fs_inst *
638 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
639 {
640 return emit(fs_inst(opcode, dst, src0));
641 }
642
643 fs_inst *
644 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
645 {
646 return emit(fs_inst(opcode, dst, src0, src1));
647 }
648
649 fs_inst *
650 fs_visitor::emit(enum opcode opcode, fs_reg dst,
651 fs_reg src0, fs_reg src1, fs_reg src2)
652 {
653 return emit(fs_inst(opcode, dst, src0, src1, src2));
654 }
655
656 void
657 fs_visitor::push_force_uncompressed()
658 {
659 force_uncompressed_stack++;
660 }
661
662 void
663 fs_visitor::pop_force_uncompressed()
664 {
665 force_uncompressed_stack--;
666 assert(force_uncompressed_stack >= 0);
667 }
668
669 void
670 fs_visitor::push_force_sechalf()
671 {
672 force_sechalf_stack++;
673 }
674
675 void
676 fs_visitor::pop_force_sechalf()
677 {
678 force_sechalf_stack--;
679 assert(force_sechalf_stack >= 0);
680 }
681
682 /**
683 * Returns true if the instruction has a flag that means it won't
684 * update an entire destination register.
685 *
686 * For example, dead code elimination and live variable analysis want to know
687 * when a write to a variable screens off any preceding values that were in
688 * it.
689 */
690 bool
691 fs_inst::is_partial_write()
692 {
693 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
694 this->force_uncompressed ||
695 this->force_sechalf);
696 }
697
698 /**
699 * Returns how many MRFs an FS opcode will write over.
700 *
701 * Note that this is not the 0 or 1 implied writes in an actual gen
702 * instruction -- the FS opcodes often generate MOVs in addition.
703 */
704 int
705 fs_visitor::implied_mrf_writes(fs_inst *inst)
706 {
707 if (inst->mlen == 0)
708 return 0;
709
710 switch (inst->opcode) {
711 case SHADER_OPCODE_RCP:
712 case SHADER_OPCODE_RSQ:
713 case SHADER_OPCODE_SQRT:
714 case SHADER_OPCODE_EXP2:
715 case SHADER_OPCODE_LOG2:
716 case SHADER_OPCODE_SIN:
717 case SHADER_OPCODE_COS:
718 return 1 * dispatch_width / 8;
719 case SHADER_OPCODE_POW:
720 case SHADER_OPCODE_INT_QUOTIENT:
721 case SHADER_OPCODE_INT_REMAINDER:
722 return 2 * dispatch_width / 8;
723 case SHADER_OPCODE_TEX:
724 case FS_OPCODE_TXB:
725 case SHADER_OPCODE_TXD:
726 case SHADER_OPCODE_TXF:
727 case SHADER_OPCODE_TXF_MS:
728 case SHADER_OPCODE_TXL:
729 case SHADER_OPCODE_TXS:
730 case SHADER_OPCODE_LOD:
731 return 1;
732 case FS_OPCODE_FB_WRITE:
733 return 2;
734 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
735 case FS_OPCODE_UNSPILL:
736 return 1;
737 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
738 return inst->mlen;
739 case FS_OPCODE_SPILL:
740 return 2;
741 default:
742 assert(!"not reached");
743 return inst->mlen;
744 }
745 }
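/* Example of the accounting above: SHADER_OPCODE_POW in 16-wide dispatch
 * reports 2 * 16 / 8 = 4 MRFs written, while the unary math opcodes report
 * 1 * dispatch_width / 8, i.e. one MRF per 8 channels of payload.
 */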
746
747 int
748 fs_visitor::virtual_grf_alloc(int size)
749 {
750 if (virtual_grf_array_size <= virtual_grf_count) {
751 if (virtual_grf_array_size == 0)
752 virtual_grf_array_size = 16;
753 else
754 virtual_grf_array_size *= 2;
755 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
756 virtual_grf_array_size);
757 }
758 virtual_grf_sizes[virtual_grf_count] = size;
759 return virtual_grf_count++;
760 }
761
762 /** Fixed HW reg constructor. */
763 fs_reg::fs_reg(enum register_file file, int reg)
764 {
765 init();
766 this->file = file;
767 this->reg = reg;
768 this->type = BRW_REGISTER_TYPE_F;
769 }
770
771 /** Fixed HW reg constructor. */
772 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
773 {
774 init();
775 this->file = file;
776 this->reg = reg;
777 this->type = type;
778 }
779
780 /** Automatic reg constructor. */
781 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
782 {
783 init();
784
785 this->file = GRF;
786 this->reg = v->virtual_grf_alloc(v->type_size(type));
787 this->reg_offset = 0;
788 this->type = brw_type_for_base_type(type);
789 }
790
791 fs_reg *
792 fs_visitor::variable_storage(ir_variable *var)
793 {
794 return (fs_reg *)hash_table_find(this->variable_ht, var);
795 }
796
797 void
798 import_uniforms_callback(const void *key,
799 void *data,
800 void *closure)
801 {
802 struct hash_table *dst_ht = (struct hash_table *)closure;
803 const fs_reg *reg = (const fs_reg *)data;
804
805 if (reg->file != UNIFORM)
806 return;
807
808 hash_table_insert(dst_ht, data, key);
809 }
810
811 /* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
812 * This brings in those uniform definitions.
813 */
814 void
815 fs_visitor::import_uniforms(fs_visitor *v)
816 {
817 hash_table_call_foreach(v->variable_ht,
818 import_uniforms_callback,
819 variable_ht);
820 this->params_remap = v->params_remap;
821 this->nr_params_remap = v->nr_params_remap;
822 }
823
824 /* Our support for uniforms is piggy-backed on the struct
825 * gl_fragment_program, because that's where the values actually
826 * get stored, rather than in some global gl_shader_program uniform
827 * store.
828 */
829 void
830 fs_visitor::setup_uniform_values(ir_variable *ir)
831 {
832 int namelen = strlen(ir->name);
833
834 /* The data for our (non-builtin) uniforms is stored in a series of
835 * gl_uniform_driver_storage structs for each subcomponent that
836 * glGetUniformLocation() could name. We know it's been set up in the same
837 * order we'd walk the type, so walk the list of storage and find anything
838 * with our name, or the prefix of a component that starts with our name.
839 */
840 unsigned params_before = c->prog_data.nr_params;
841 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
842 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
843
844 if (strncmp(ir->name, storage->name, namelen) != 0 ||
845 (storage->name[namelen] != 0 &&
846 storage->name[namelen] != '.' &&
847 storage->name[namelen] != '[')) {
848 continue;
849 }
850
851 unsigned slots = storage->type->component_slots();
852 if (storage->array_elements)
853 slots *= storage->array_elements;
854
855 for (unsigned i = 0; i < slots; i++) {
856 c->prog_data.param[c->prog_data.nr_params++] =
857 &storage->storage[i].f;
858 }
859 }
860
861 /* Make sure we actually initialized the right amount of stuff here. */
862 assert(params_before + ir->type->component_slots() ==
863 c->prog_data.nr_params);
864 (void)params_before;
865 }
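/* Example of the name matching above (hypothetical uniform names): for
 * ir->name == "light", storage entries named "light", "light[2]" and
 * "light.position" are accepted (the character after the prefix is '\0',
 * '[' or '.'), while "lights" is rejected.
 */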
866
867
868 /* Our support for builtin uniforms is even scarier than non-builtin.
869 * It sits on top of the PROG_STATE_VAR parameters that are
870 * automatically updated from GL context state.
871 */
872 void
873 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
874 {
875 const ir_state_slot *const slots = ir->state_slots;
876 assert(ir->state_slots != NULL);
877
878 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
879 /* This state reference has already been set up by ir_to_mesa, but we'll
880 * get the same index back here.
881 */
882 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
883 (gl_state_index *)slots[i].tokens);
884
885 /* Add each of the unique swizzles of the element as a parameter.
886 * This'll end up matching the expected layout of the
887 * array/matrix/structure we're trying to fill in.
888 */
889 int last_swiz = -1;
890 for (unsigned int j = 0; j < 4; j++) {
891 int swiz = GET_SWZ(slots[i].swizzle, j);
892 if (swiz == last_swiz)
893 break;
894 last_swiz = swiz;
895
896 c->prog_data.param[c->prog_data.nr_params++] =
897 &fp->Base.Parameters->ParameterValues[index][swiz].f;
898 }
899 }
900 }
901
902 fs_reg *
903 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
904 {
905 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
906 fs_reg wpos = *reg;
907 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
908
909 /* gl_FragCoord.x */
910 if (ir->pixel_center_integer) {
911 emit(MOV(wpos, this->pixel_x));
912 } else {
913 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
914 }
915 wpos.reg_offset++;
916
917 /* gl_FragCoord.y */
918 if (!flip && ir->pixel_center_integer) {
919 emit(MOV(wpos, this->pixel_y));
920 } else {
921 fs_reg pixel_y = this->pixel_y;
922 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
923
924 if (flip) {
925 pixel_y.negate = true;
926 offset += c->key.drawable_height - 1.0;
927 }
928
929 emit(ADD(wpos, pixel_y, fs_reg(offset)));
930 }
931 wpos.reg_offset++;
932
933 /* gl_FragCoord.z */
934 if (brw->gen >= 6) {
935 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
936 } else {
937 emit(FS_OPCODE_LINTERP, wpos,
938 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
939 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
940 interp_reg(VARYING_SLOT_POS, 2));
941 }
942 wpos.reg_offset++;
943
944 /* gl_FragCoord.w: Already set up in emit_interpolation */
945 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
946
947 return reg;
948 }
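/* Worked example of the gl_FragCoord.y flip above (hypothetical values):
 * with a half-integer pixel center, flip == true and drawable_height == 480,
 * the emitted ADD computes -pixel_y + (0.5 + 480 - 1.0) = 479.5 - pixel_y,
 * mirroring the window-space y coordinate.
 */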
949
950 fs_inst *
951 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
952 glsl_interp_qualifier interpolation_mode,
953 bool is_centroid)
954 {
955 brw_wm_barycentric_interp_mode barycoord_mode;
956 if (brw->gen >= 6) {
957 if (is_centroid) {
958 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
959 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
960 else
961 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
962 } else {
963 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
964 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
965 else
966 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
967 }
968 } else {
969 /* On Ironlake and below, there is only one interpolation mode.
970 * Centroid interpolation doesn't mean anything on this hardware --
971 * there is no multisampling.
972 */
973 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
974 }
975 return emit(FS_OPCODE_LINTERP, attr,
976 this->delta_x[barycoord_mode],
977 this->delta_y[barycoord_mode], interp);
978 }
979
980 fs_reg *
981 fs_visitor::emit_general_interpolation(ir_variable *ir)
982 {
983 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
984 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
985 fs_reg attr = *reg;
986
987 unsigned int array_elements;
988 const glsl_type *type;
989
990 if (ir->type->is_array()) {
991 array_elements = ir->type->length;
992 if (array_elements == 0) {
993 fail("dereferenced array '%s' has length 0\n", ir->name);
994 }
995 type = ir->type->fields.array;
996 } else {
997 array_elements = 1;
998 type = ir->type;
999 }
1000
1001 glsl_interp_qualifier interpolation_mode =
1002 ir->determine_interpolation_mode(c->key.flat_shade);
1003
1004 int location = ir->location;
1005 for (unsigned int i = 0; i < array_elements; i++) {
1006 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1007 if (c->prog_data.urb_setup[location] == -1) {
1008 /* If there's no incoming setup data for this slot, don't
1009 * emit interpolation for it.
1010 */
1011 attr.reg_offset += type->vector_elements;
1012 location++;
1013 continue;
1014 }
1015
1016 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1017 /* Constant interpolation (flat shading) case. The SF has
1018 * handed us defined values in only the constant offset
1019 * field of the setup reg.
1020 */
1021 for (unsigned int k = 0; k < type->vector_elements; k++) {
1022 struct brw_reg interp = interp_reg(location, k);
1023 interp = suboffset(interp, 3);
1024 interp.type = reg->type;
1025 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1026 attr.reg_offset++;
1027 }
1028 } else {
1029 /* Smooth/noperspective interpolation case. */
1030 for (unsigned int k = 0; k < type->vector_elements; k++) {
1031 /* FINISHME: At some point we probably want to push
1032 * this farther by giving similar treatment to the
1033 * other potentially constant components of the
1034 * attribute, as well as making brw_vs_constval.c
1035 * handle varyings other than gl_TexCoord.
1036 */
1037 struct brw_reg interp = interp_reg(location, k);
1038 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1039 ir->centroid);
1040 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1041 /* Get the pixel/sample mask into f0 so that we know
1042 * which pixels are lit. Then, for each channel that is
1043 * unlit, replace the centroid data with non-centroid
1044 * data.
1045 */
1046 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1047 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1048 interpolation_mode, false);
1049 inst->predicate = BRW_PREDICATE_NORMAL;
1050 inst->predicate_inverse = true;
1051 }
1052 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1053 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1054 }
1055 attr.reg_offset++;
1056 }
1057
1058 }
1059 location++;
1060 }
1061 }
1062
1063 return reg;
1064 }
1065
1066 fs_reg *
1067 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1068 {
1069 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1070
1071 /* The frontfacing comes in as a bit in the thread payload. */
1072 if (brw->gen >= 6) {
1073 emit(BRW_OPCODE_ASR, *reg,
1074 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1075 fs_reg(15));
1076 emit(BRW_OPCODE_NOT, *reg, *reg);
1077 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1078 } else {
1079 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1080 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1081 * us front face
1082 */
1083 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1084 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1085 }
1086
1087 return reg;
1088 }
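/* Reading the gen6+ path above: the payload dword is arithmetically shifted
 * right by 15, inverted and masked with 1, so *reg ends up as 1 exactly when
 * bit 15 of g0.0 is clear, which this code treats as "front facing". The
 * pre-gen6 path instead tests bit 31 of r1.6 as described in its comment.
 */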
1089
1090 fs_reg
1091 fs_visitor::fix_math_operand(fs_reg src)
1092 {
1093 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1094 * might be able to do better by doing execsize = 1 math and then
1095 * expanding that result out, but we would need to be careful with
1096 * masking.
1097 *
1098 * The hardware ignores source modifiers (negate and abs) on math
1099 * instructions, so we also move to a temp to set those up.
1100 */
1101 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1102 !src.abs && !src.negate)
1103 return src;
1104
1105 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1106 * operands with math instructions.
1107 */
1108 if (brw->gen >= 7 && src.file != IMM)
1109 return src;
1110
1111 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1112 expanded.type = src.type;
1113 emit(BRW_OPCODE_MOV, expanded, src);
1114 return expanded;
1115 }
1116
1117 fs_inst *
1118 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1119 {
1120 switch (opcode) {
1121 case SHADER_OPCODE_RCP:
1122 case SHADER_OPCODE_RSQ:
1123 case SHADER_OPCODE_SQRT:
1124 case SHADER_OPCODE_EXP2:
1125 case SHADER_OPCODE_LOG2:
1126 case SHADER_OPCODE_SIN:
1127 case SHADER_OPCODE_COS:
1128 break;
1129 default:
1130 assert(!"not reached: bad math opcode");
1131 return NULL;
1132 }
1133
1134 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1135 * might be able to do better by doing execsize = 1 math and then
1136 * expanding that result out, but we would need to be careful with
1137 * masking.
1138 *
1139 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1140 * instructions, so we also move to a temp to set those up.
1141 */
1142 if (brw->gen >= 6)
1143 src = fix_math_operand(src);
1144
1145 fs_inst *inst = emit(opcode, dst, src);
1146
1147 if (brw->gen < 6) {
1148 inst->base_mrf = 2;
1149 inst->mlen = dispatch_width / 8;
1150 }
1151
1152 return inst;
1153 }
1154
1155 fs_inst *
1156 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1157 {
1158 int base_mrf = 2;
1159 fs_inst *inst;
1160
1161 switch (opcode) {
1162 case SHADER_OPCODE_INT_QUOTIENT:
1163 case SHADER_OPCODE_INT_REMAINDER:
1164 if (brw->gen >= 7 && dispatch_width == 16)
1165 fail("16-wide INTDIV unsupported\n");
1166 break;
1167 case SHADER_OPCODE_POW:
1168 break;
1169 default:
1170 assert(!"not reached: unsupported binary math opcode.");
1171 return NULL;
1172 }
1173
1174 if (brw->gen >= 6) {
1175 src0 = fix_math_operand(src0);
1176 src1 = fix_math_operand(src1);
1177
1178 inst = emit(opcode, dst, src0, src1);
1179 } else {
1180 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1181 * "Message Payload":
1182 *
1183 * "Operand0[7]. For the INT DIV functions, this operand is the
1184 * denominator."
1185 * ...
1186 * "Operand1[7]. For the INT DIV functions, this operand is the
1187 * numerator."
1188 */
1189 bool is_int_div = opcode != SHADER_OPCODE_POW;
1190 fs_reg &op0 = is_int_div ? src1 : src0;
1191 fs_reg &op1 = is_int_div ? src0 : src1;
1192
1193 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1194 inst = emit(opcode, dst, op0, reg_null_f);
1195
1196 inst->base_mrf = base_mrf;
1197 inst->mlen = 2 * dispatch_width / 8;
1198 }
1199 return inst;
1200 }
1201
1202 void
1203 fs_visitor::assign_curb_setup()
1204 {
1205 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1206 if (dispatch_width == 8) {
1207 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1208 } else {
1209 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1210 }
1211
1212 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1213 foreach_list(node, &this->instructions) {
1214 fs_inst *inst = (fs_inst *)node;
1215
1216 for (unsigned int i = 0; i < 3; i++) {
1217 if (inst->src[i].file == UNIFORM) {
1218 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1219 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1220 constant_nr / 8,
1221 constant_nr % 8);
1222
1223 inst->src[i].file = HW_REG;
1224 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1225 }
1226 }
1227 }
1228 }
1229
1230 void
1231 fs_visitor::calculate_urb_setup()
1232 {
1233 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1234 c->prog_data.urb_setup[i] = -1;
1235 }
1236
1237 int urb_next = 0;
1238 /* Figure out where each of the incoming setup attributes lands. */
1239 if (brw->gen >= 6) {
1240 if (_mesa_bitcount_64(fp->Base.InputsRead &
1241 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1242 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1243 * first 16 varying inputs, so we can put them wherever we want.
1244 * Just put them in order.
1245 *
1246 * This is useful because it means that (a) inputs not used by the
1247 * fragment shader won't take up valuable register space, and (b) we
1248 * won't have to recompile the fragment shader if it gets paired with
1249 * a different vertex (or geometry) shader.
1250 */
1251 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1252 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1253 BITFIELD64_BIT(i)) {
1254 c->prog_data.urb_setup[i] = urb_next++;
1255 }
1256 }
1257 } else {
1258 /* We have enough input varyings that the SF/SBE pipeline stage can't
1259 * arbitrarily rearrange them to suit our whim; we have to put them
1260 * in an order that matches the output of the previous pipeline stage
1261 * (geometry or vertex shader).
1262 */
1263 struct brw_vue_map prev_stage_vue_map;
1264 brw_compute_vue_map(brw, &prev_stage_vue_map,
1265 c->key.input_slots_valid);
1266 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1267 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1268 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1269 slot++) {
1270 int varying = prev_stage_vue_map.slot_to_varying[slot];
1271 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1272 * unused.
1273 */
1274 if (varying != BRW_VARYING_SLOT_COUNT &&
1275 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1276 BITFIELD64_BIT(varying))) {
1277 c->prog_data.urb_setup[varying] = slot - first_slot;
1278 }
1279 }
1280 urb_next = prev_stage_vue_map.num_slots - first_slot;
1281 }
1282 } else {
1283 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1284 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1285 /* Point size is packed into the header, not as a general attribute */
1286 if (i == VARYING_SLOT_PSIZ)
1287 continue;
1288
1289 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1290 /* The back color slot is skipped when the front color is
1291 * also written to. In addition, some slots can be
1292 * written in the vertex shader and not read in the
1293 * fragment shader. So the register number must always be
1294 * incremented, mapped or not.
1295 */
1296 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1297 c->prog_data.urb_setup[i] = urb_next;
1298 urb_next++;
1299 }
1300 }
1301
1302 /*
1303 * It's a FS only attribute, and we did interpolation for this attribute
1304 * in SF thread. So, count it here, too.
1305 *
1306 * See compile_sf_prog() for more info.
1307 */
1308 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1309 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1310 }
1311
1312 c->prog_data.num_varying_inputs = urb_next;
1313 }
1314
1315 void
1316 fs_visitor::assign_urb_setup()
1317 {
1318 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1319
1320 /* Offset all the urb_setup[] indices by the actual position of the
1321 * setup regs, now that the location of the constants has been chosen.
1322 */
1323 foreach_list(node, &this->instructions) {
1324 fs_inst *inst = (fs_inst *)node;
1325
1326 if (inst->opcode == FS_OPCODE_LINTERP) {
1327 assert(inst->src[2].file == HW_REG);
1328 inst->src[2].fixed_hw_reg.nr += urb_start;
1329 }
1330
1331 if (inst->opcode == FS_OPCODE_CINTERP) {
1332 assert(inst->src[0].file == HW_REG);
1333 inst->src[0].fixed_hw_reg.nr += urb_start;
1334 }
1335 }
1336
1337 /* Each attribute is 4 setup channels, each of which is half a reg. */
1338 this->first_non_payload_grf =
1339 urb_start + c->prog_data.num_varying_inputs * 2;
1340 }
1341
1342 /**
1343 * Split large virtual GRFs into separate components if we can.
1344 *
1345 * This is mostly duplicated with what brw_fs_vector_splitting does,
1346 * but that's really conservative because it's afraid of doing
1347 * splitting that doesn't result in real progress after the rest of
1348 * the optimization phases, which would cause infinite looping in
1349 * optimization. We can do it once here, safely. This also has the
1350 * opportunity to split interpolated values, or maybe even uniforms,
1351 * which we don't have at the IR level.
1352 *
1353 * We want to split, because virtual GRFs are what we register
1354 * allocate and spill (due to contiguousness requirements for some
1355 * instructions), and they're what we naturally generate in the
1356 * codegen process, but most virtual GRFs don't actually need to be
1357 * contiguous sets of GRFs. If we split, we'll end up with reduced
1358 * live intervals and better dead code elimination and coalescing.
1359 */
1360 void
1361 fs_visitor::split_virtual_grfs()
1362 {
1363 int num_vars = this->virtual_grf_count;
1364 bool split_grf[num_vars];
1365 int new_virtual_grf[num_vars];
1366
1367 /* Try to split anything > 0 sized. */
1368 for (int i = 0; i < num_vars; i++) {
1369 if (this->virtual_grf_sizes[i] != 1)
1370 split_grf[i] = true;
1371 else
1372 split_grf[i] = false;
1373 }
1374
1375 if (brw->has_pln &&
1376 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1377 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1378 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1379 * Gen6, that was the only supported interpolation mode, and since Gen6,
1380 * delta_x and delta_y are in fixed hardware registers.
1381 */
1382 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1383 false;
1384 }
1385
1386 foreach_list(node, &this->instructions) {
1387 fs_inst *inst = (fs_inst *)node;
1388
1389 /* If there's a SEND message that requires contiguous destination
1390 * registers, no splitting is allowed.
1391 */
1392 if (inst->regs_written > 1) {
1393 split_grf[inst->dst.reg] = false;
1394 }
1395
1396 /* If we're sending from a GRF, don't split it, on the assumption that
1397 * the send is reading the whole thing.
1398 */
1399 if (inst->is_send_from_grf()) {
1400 for (int i = 0; i < 3; i++) {
1401 if (inst->src[i].file == GRF) {
1402 split_grf[inst->src[i].reg] = false;
1403 }
1404 }
1405 }
1406 }
1407
1408 /* Allocate new space for split regs. Note that the virtual
1409 * numbers will be contiguous.
1410 */
1411 for (int i = 0; i < num_vars; i++) {
1412 if (split_grf[i]) {
1413 new_virtual_grf[i] = virtual_grf_alloc(1);
1414 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1415 int reg = virtual_grf_alloc(1);
1416 assert(reg == new_virtual_grf[i] + j - 1);
1417 (void) reg;
1418 }
1419 this->virtual_grf_sizes[i] = 1;
1420 }
1421 }
1422
1423 foreach_list(node, &this->instructions) {
1424 fs_inst *inst = (fs_inst *)node;
1425
1426 if (inst->dst.file == GRF &&
1427 split_grf[inst->dst.reg] &&
1428 inst->dst.reg_offset != 0) {
1429 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1430 inst->dst.reg_offset - 1);
1431 inst->dst.reg_offset = 0;
1432 }
1433 for (int i = 0; i < 3; i++) {
1434 if (inst->src[i].file == GRF &&
1435 split_grf[inst->src[i].reg] &&
1436 inst->src[i].reg_offset != 0) {
1437 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1438 inst->src[i].reg_offset - 1);
1439 inst->src[i].reg_offset = 0;
1440 }
1441 }
1442 }
1443 this->live_intervals_valid = false;
1444 }
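/* Sketch of the effect of split_virtual_grfs() (hypothetical register
 * numbers): a size-4 VGRF vgrf5 that is only ever accessed one register at a
 * time becomes four size-1 VGRFs, and an access such as vgrf5 with
 * reg_offset 2 is rewritten to point at the new VGRF holding that component,
 * with reg_offset 0.
 */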
1445
1446 /**
1447 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1448 *
1449 * During code generation, we create tons of temporary variables, many of
1450 * which get immediately killed and are never used again. Yet, in later
1451 * optimization and analysis passes, such as compute_live_intervals, we need
1452 * to loop over all the virtual GRFs. Compacting them can save a lot of
1453 * overhead.
1454 */
1455 void
1456 fs_visitor::compact_virtual_grfs()
1457 {
1458 /* Mark which virtual GRFs are used, and count how many. */
1459 int remap_table[this->virtual_grf_count];
1460 memset(remap_table, -1, sizeof(remap_table));
1461
1462 foreach_list(node, &this->instructions) {
1463 const fs_inst *inst = (const fs_inst *) node;
1464
1465 if (inst->dst.file == GRF)
1466 remap_table[inst->dst.reg] = 0;
1467
1468 for (int i = 0; i < 3; i++) {
1469 if (inst->src[i].file == GRF)
1470 remap_table[inst->src[i].reg] = 0;
1471 }
1472 }
1473
1474 /* In addition to registers used in instructions, fs_visitor keeps
1475 * direct references to certain special values which must be patched:
1476 */
1477 fs_reg *special[] = {
1478 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1479 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1480 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1481 &delta_x[0], &delta_x[1], &delta_x[2],
1482 &delta_x[3], &delta_x[4], &delta_x[5],
1483 &delta_y[0], &delta_y[1], &delta_y[2],
1484 &delta_y[3], &delta_y[4], &delta_y[5],
1485 };
1486 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1487 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1488
1489 /* Treat all special values as used, to be conservative */
1490 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1491 if (special[i]->file == GRF)
1492 remap_table[special[i]->reg] = 0;
1493 }
1494
1495 /* Compact the GRF arrays. */
1496 int new_index = 0;
1497 for (int i = 0; i < this->virtual_grf_count; i++) {
1498 if (remap_table[i] != -1) {
1499 remap_table[i] = new_index;
1500 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1501 if (live_intervals_valid) {
1502 virtual_grf_start[new_index] = virtual_grf_start[i];
1503 virtual_grf_end[new_index] = virtual_grf_end[i];
1504 }
1505 ++new_index;
1506 }
1507 }
1508
1509 this->virtual_grf_count = new_index;
1510
1511 /* Patch all the instructions to use the newly renumbered registers */
1512 foreach_list(node, &this->instructions) {
1513 fs_inst *inst = (fs_inst *) node;
1514
1515 if (inst->dst.file == GRF)
1516 inst->dst.reg = remap_table[inst->dst.reg];
1517
1518 for (int i = 0; i < 3; i++) {
1519 if (inst->src[i].file == GRF)
1520 inst->src[i].reg = remap_table[inst->src[i].reg];
1521 }
1522 }
1523
1524 /* Patch all the references to special values */
1525 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1526 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1527 special[i]->reg = remap_table[special[i]->reg];
1528 }
1529 }
1530
1531 bool
1532 fs_visitor::remove_dead_constants()
1533 {
1534 if (dispatch_width == 8) {
1535 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1536 this->nr_params_remap = c->prog_data.nr_params;
1537
1538 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1539 this->params_remap[i] = -1;
1540
1541 /* Find which params are still in use. */
1542 foreach_list(node, &this->instructions) {
1543 fs_inst *inst = (fs_inst *)node;
1544
1545 for (int i = 0; i < 3; i++) {
1546 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1547
1548 if (inst->src[i].file != UNIFORM)
1549 continue;
1550
1551 /* Section 5.11 of the OpenGL 4.3 spec says:
1552 *
1553 * "Out-of-bounds reads return undefined values, which include
1554 * values from other variables of the active program or zero."
1555 */
1556 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1557 constant_nr = 0;
1558 }
1559
1560 /* For now, set this to non-negative. We'll give it the
1561 * actual new number in a moment, in order to keep the
1562 * register numbers nicely ordered.
1563 */
1564 this->params_remap[constant_nr] = 0;
1565 }
1566 }
1567
1568 /* Figure out what the new numbers for the params will be. At some
1569 * point when we're doing uniform array access, we're going to want
1570 * to keep the distinction between .reg and .reg_offset, but for
1571 * now we don't care.
1572 */
1573 unsigned int new_nr_params = 0;
1574 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1575 if (this->params_remap[i] != -1) {
1576 this->params_remap[i] = new_nr_params++;
1577 }
1578 }
1579
1580 /* Update the list of params to be uploaded to match our new numbering. */
1581 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1582 int remapped = this->params_remap[i];
1583
1584 if (remapped == -1)
1585 continue;
1586
1587 c->prog_data.param[remapped] = c->prog_data.param[i];
1588 }
1589
1590 c->prog_data.nr_params = new_nr_params;
1591 } else {
1592 /* This should have been generated in the 8-wide pass already. */
1593 assert(this->params_remap);
1594 }
1595
1596 /* Now do the renumbering of the shader to remove unused params. */
1597 foreach_list(node, &this->instructions) {
1598 fs_inst *inst = (fs_inst *)node;
1599
1600 for (int i = 0; i < 3; i++) {
1601 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1602
1603 if (inst->src[i].file != UNIFORM)
1604 continue;
1605
1606 /* As above, alias out-of-bounds accesses to 0. */
1607 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1608 constant_nr = 0;
1609 }
1610 assert(this->params_remap[constant_nr] != -1);
1611 inst->src[i].reg = this->params_remap[constant_nr];
1612 inst->src[i].reg_offset = 0;
1613 }
1614 }
1615
1616 return true;
1617 }
1618
1619 /*
1620 * Implements array access of uniforms by inserting a
1621 * PULL_CONSTANT_LOAD instruction.
1622 *
1623 * Unlike temporary GRF array access (where we don't support it due to
1624 * the difficulty of doing relative addressing on instruction
1625 * destinations), we could potentially do array access of uniforms
1626 * that were loaded in GRF space as push constants. In real-world
1627 * usage we've seen, though, the arrays being used are always larger
1628 * than we could load as push constants, so just always move all
1629 * uniform array access out to a pull constant buffer.
1630 */
1631 void
1632 fs_visitor::move_uniform_array_access_to_pull_constants()
1633 {
1634 int pull_constant_loc[c->prog_data.nr_params];
1635
1636 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1637 pull_constant_loc[i] = -1;
1638 }
1639
1640 /* Walk through and find array access of uniforms. Put a copy of that
1641 * uniform in the pull constant buffer.
1642 *
1643 * Note that we don't move constant-indexed accesses to arrays. No
1644 * testing has been done of the performance impact of this choice.
1645 */
1646 foreach_list_safe(node, &this->instructions) {
1647 fs_inst *inst = (fs_inst *)node;
1648
1649 for (int i = 0 ; i < 3; i++) {
1650 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1651 continue;
1652
1653 int uniform = inst->src[i].reg;
1654
1655 /* If this array isn't already present in the pull constant buffer,
1656 * add it.
1657 */
1658 if (pull_constant_loc[uniform] == -1) {
1659 const float **values = &c->prog_data.param[uniform];
1660
1661 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1662
1663 assert(param_size[uniform]);
1664
1665 for (int j = 0; j < param_size[uniform]; j++) {
1666 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1667 values[j];
1668 }
1669 }
1670
1671 /* Set up the annotation tracking for new generated instructions. */
1672 base_ir = inst->ir;
1673 current_annotation = inst->annotation;
1674
1675 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1676 fs_reg temp = fs_reg(this, glsl_type::float_type);
1677 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1678 surf_index,
1679 *inst->src[i].reladdr,
1680 pull_constant_loc[uniform] +
1681 inst->src[i].reg_offset);
1682 inst->insert_before(&list);
1683
1684 inst->src[i].file = temp.file;
1685 inst->src[i].reg = temp.reg;
1686 inst->src[i].reg_offset = temp.reg_offset;
1687 inst->src[i].reladdr = NULL;
1688 }
1689 }
1690 }
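/* Tying this back to the "uniform vec4 a[20]" example in the
 * VARYING_PULL_CONSTANT_LOAD comment: the whole array is appended to
 * pull_param the first time a reladdr access to it is seen, and the access
 * is then rewritten as a pull load whose variable offset is the reladdr
 * register and whose constant offset is the array's pull-buffer location
 * plus the source's reg_offset.
 */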
1691
1692 /**
1693 * Choose accesses from the UNIFORM file to demote to using the pull
1694 * constant buffer.
1695 *
1696 * We allow a fragment shader to have more than the specified minimum
1697 * maximum number of fragment shader uniform components (64). If
1698 * there are too many of these, they'd fill up all of register space.
1699 * So, this will push some of them out to the pull constant buffer and
1700 * update the program to load them.
1701 */
1702 void
1703 fs_visitor::setup_pull_constants()
1704 {
1705 /* Only allow 16 registers (128 uniform components) as push constants. */
1706 unsigned int max_uniform_components = 16 * 8;
1707 if (c->prog_data.nr_params <= max_uniform_components)
1708 return;
1709
1710 if (dispatch_width == 16) {
1711 fail("Pull constants not supported in 16-wide\n");
1712 return;
1713 }
1714
1715 /* Just demote the end of the list. We could probably do better
1716 * here, demoting things that are rarely used in the program first.
1717 */
1718 unsigned int pull_uniform_base = max_uniform_components;
1719
1720 int pull_constant_loc[c->prog_data.nr_params];
1721 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1722 if (i < pull_uniform_base) {
1723 pull_constant_loc[i] = -1;
1724 } else {
1725 pull_constant_loc[i] = -1;
1726 /* If our constant is already being uploaded for reladdr purposes,
1727 * reuse it.
1728 */
1729 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1730 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1731 pull_constant_loc[i] = j;
1732 break;
1733 }
1734 }
1735 if (pull_constant_loc[i] == -1) {
1736 int pull_index = c->prog_data.nr_pull_params++;
1737 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1738 pull_constant_loc[i] = pull_index;
1739 }
1740 }
1741 }
1742 c->prog_data.nr_params = pull_uniform_base;
1743
1744 foreach_list(node, &this->instructions) {
1745 fs_inst *inst = (fs_inst *)node;
1746
1747 for (int i = 0; i < 3; i++) {
1748 if (inst->src[i].file != UNIFORM)
1749 continue;
1750
1751 int pull_index = pull_constant_loc[inst->src[i].reg +
1752 inst->src[i].reg_offset];
1753 if (pull_index == -1)
1754 continue;
1755
1756 assert(!inst->src[i].reladdr);
1757
1758 fs_reg dst = fs_reg(this, glsl_type::float_type);
1759 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1760 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1761 fs_inst *pull =
1762 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1763 dst, index, offset);
1764 pull->ir = inst->ir;
1765 pull->annotation = inst->annotation;
1766
1767 inst->insert_before(pull);
1768
1769 inst->src[i].file = GRF;
1770 inst->src[i].reg = dst.reg;
1771 inst->src[i].reg_offset = 0;
1772 inst->src[i].smear = pull_index & 3;
1773 }
1774 }
1775 }
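/* Worked example of the demotion above (hypothetical pull_index): for
 * pull_index == 6 the load fetches the 16-byte-aligned block at offset
 * (6 * 4) & ~15 == 16, and smear == 6 & 3 == 2 selects the third component
 * of that block for the rewritten source.
 */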
1776
1777 bool
1778 fs_visitor::opt_algebraic()
1779 {
1780 bool progress = false;
1781
1782 foreach_list(node, &this->instructions) {
1783 fs_inst *inst = (fs_inst *)node;
1784
1785 switch (inst->opcode) {
1786 case BRW_OPCODE_MUL:
1787 if (inst->src[1].file != IMM)
1788 continue;
1789
1790 /* a * 1.0 = a */
1791 if (inst->src[1].is_one()) {
1792 inst->opcode = BRW_OPCODE_MOV;
1793 inst->src[1] = reg_undef;
1794 progress = true;
1795 break;
1796 }
1797
1798 /* a * 0.0 = 0.0 */
1799 if (inst->src[1].is_zero()) {
1800 inst->opcode = BRW_OPCODE_MOV;
1801 inst->src[0] = inst->src[1];
1802 inst->src[1] = reg_undef;
1803 progress = true;
1804 break;
1805 }
1806
1807 break;
1808 case BRW_OPCODE_ADD:
1809 if (inst->src[1].file != IMM)
1810 continue;
1811
1812 /* a + 0.0 = a */
1813 if (inst->src[1].is_zero()) {
1814 inst->opcode = BRW_OPCODE_MOV;
1815 inst->src[1] = reg_undef;
1816 progress = true;
1817 break;
1818 }
1819 break;
1820 default:
1821 break;
1822 }
1823 }
1824
1825 return progress;
1826 }
1827
1828 /**
1829 * Removes any instructions writing a VGRF where that VGRF is not used by any
1830 * later instruction.
1831 */
1832 bool
1833 fs_visitor::dead_code_eliminate()
1834 {
1835 bool progress = false;
1836 int pc = 0;
1837
1838 calculate_live_intervals();
1839
1840 foreach_list_safe(node, &this->instructions) {
1841 fs_inst *inst = (fs_inst *)node;
1842
1843 if (inst->dst.file == GRF) {
1844 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1845 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1846 inst->remove();
1847 progress = true;
1848 }
1849 }
1850
1851 pc++;
1852 }
1853
1854 if (progress)
1855 live_intervals_valid = false;
1856
1857 return progress;
1858 }
1859
1860 struct dead_code_hash_key
1861 {
1862 int vgrf;
1863 int reg_offset;
1864 };
1865
1866 static bool
1867 dead_code_hash_compare(const void *a, const void *b)
1868 {
1869 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1870 }
1871
1872 static void
1873 clear_dead_code_hash(struct hash_table *ht)
1874 {
1875 struct hash_entry *entry;
1876
1877 hash_table_foreach(ht, entry) {
1878 _mesa_hash_table_remove(ht, entry);
1879 }
1880 }
1881
1882 static void
1883 insert_dead_code_hash(struct hash_table *ht,
1884 int vgrf, int reg_offset, fs_inst *inst)
1885 {
1886 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1887 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1888
1889 key->vgrf = vgrf;
1890 key->reg_offset = reg_offset;
1891
1892 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1893 }
1894
1895 static struct hash_entry *
1896 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1897 {
1898 struct dead_code_hash_key key;
1899
1900 key.vgrf = vgrf;
1901 key.reg_offset = reg_offset;
1902
1903 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1904 }
1905
1906 static void
1907 remove_dead_code_hash(struct hash_table *ht,
1908 int vgrf, int reg_offset)
1909 {
1910 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1911 if (!entry)
1912 return;
1913
1914 _mesa_hash_table_remove(ht, entry);
1915 }
1916
1917 /**
1918 * Walks basic blocks, removing any regs that are written but not read before
1919 * being redefined.
1920 *
1921 * The dead_code_eliminate() function implements a global dead code
1922 * elimination, but it only handles removing the last write to a register
1923 * if it's never read. This one can handle intermediate writes, but only
1924 * within a basic block.
1925 */
1926 bool
1927 fs_visitor::dead_code_eliminate_local()
1928 {
1929 struct hash_table *ht;
1930 bool progress = false;
1931
1932 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1933
1934 foreach_list_safe(node, &this->instructions) {
1935 fs_inst *inst = (fs_inst *)node;
1936
1937 /* At a basic block boundary, empty the HT since we don't track
1938 * dataflow across blocks.
1939 */
1940 if (inst->is_control_flow()) {
1941 clear_dead_code_hash(ht);
1942 continue;
1943 }
1944
1945 /* Clear the HT of any instructions that got read. */
1946 for (int i = 0; i < 3; i++) {
1947 fs_reg src = inst->src[i];
1948 if (src.file != GRF)
1949 continue;
1950
1951 int read = 1;
1952 if (inst->is_send_from_grf())
1953 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1954
1955 for (int reg_offset = src.reg_offset;
1956 reg_offset < src.reg_offset + read;
1957 reg_offset++) {
1958 remove_dead_code_hash(ht, src.reg, reg_offset);
1959 }
1960 }
1961
1962 /* Add any update of a GRF to the HT, removing a previous write if it
1963 * wasn't read.
1964 */
1965 if (inst->dst.file == GRF) {
1966 if (inst->regs_written > 1) {
1967 /* We don't know how to trim channels from an instruction's
1968 * writes, so we can't incrementally remove unread channels from
1969 * it. Just remove whatever it overwrites from the table.
1970 */
1971 for (int i = 0; i < inst->regs_written; i++) {
1972 remove_dead_code_hash(ht,
1973 inst->dst.reg,
1974 inst->dst.reg_offset + i);
1975 }
1976 } else {
1977 struct hash_entry *entry =
1978 get_dead_code_hash_entry(ht, inst->dst.reg,
1979 inst->dst.reg_offset);
1980
1981 if (inst->is_partial_write()) {
1982 /* For a partial write, we can't remove any previous dead code
1983 * candidate, since we're just modifying its result, but this
1984 * instruction can still be dead code eliminated itself.
1985 */
1986 if (entry) {
1987 entry->data = inst;
1988 } else {
1989 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1990 inst);
1991 }
1992 } else {
1993 if (entry) {
1994 /* We're completely updating a channel, and there was a
1995 * previous write to the channel that wasn't read. Kill it!
1996 */
1997 fs_inst *inst = (fs_inst *)entry->data;
1998 inst->remove();
1999 progress = true;
2000 _mesa_hash_table_remove(ht, entry);
2001 }
2002
2003 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2004 inst);
2005 }
2006 }
2007 }
2008 }
2009
2010 _mesa_hash_table_destroy(ht, NULL);
2011
2012 if (progress)
2013 live_intervals_valid = false;
2014
2015 return progress;
2016 }
2017
2018 /**
2019 * Implements a second type of register coalescing: This one checks if
2020 * the two regs involved in a raw move don't interfere, in which case
2021 * they can both be stored in the same place and the MOV removed.
2022 */
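/* Purely illustrative example (hypothetical IR): for
 *
 *    1. add vgrf3, vgrf1, vgrf2
 *    2. ... no further writes to vgrf3 or vgrf9 ...
 *    3. mov vgrf9, vgrf3
 *
 * if vgrf3 and vgrf9 have non-interfering live intervals, every reference
 * to vgrf3 is simply renamed to vgrf9 and the MOV at instruction 3 is
 * deleted.
 */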
2023 bool
2024 fs_visitor::register_coalesce_2()
2025 {
2026 bool progress = false;
2027
2028 calculate_live_intervals();
2029
2030 foreach_list_safe(node, &this->instructions) {
2031 fs_inst *inst = (fs_inst *)node;
2032
2033 if (inst->opcode != BRW_OPCODE_MOV ||
2034 inst->is_partial_write() ||
2035 inst->saturate ||
2036 inst->src[0].file != GRF ||
2037 inst->src[0].negate ||
2038 inst->src[0].abs ||
2039 inst->src[0].smear != -1 ||
2040 inst->dst.file != GRF ||
2041 inst->dst.type != inst->src[0].type ||
2042 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2043 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2044 continue;
2045 }
2046
2047 int reg_from = inst->src[0].reg;
2048 assert(inst->src[0].reg_offset == 0);
2049 int reg_to = inst->dst.reg;
2050 int reg_to_offset = inst->dst.reg_offset;
2051
2052 foreach_list(node, &this->instructions) {
2053 fs_inst *scan_inst = (fs_inst *)node;
2054
2055 if (scan_inst->dst.file == GRF &&
2056 scan_inst->dst.reg == reg_from) {
2057 scan_inst->dst.reg = reg_to;
2058 scan_inst->dst.reg_offset = reg_to_offset;
2059 }
2060 for (int i = 0; i < 3; i++) {
2061 if (scan_inst->src[i].file == GRF &&
2062 scan_inst->src[i].reg == reg_from) {
2063 scan_inst->src[i].reg = reg_to;
2064 scan_inst->src[i].reg_offset = reg_to_offset;
2065 }
2066 }
2067 }
2068
2069 inst->remove();
2070
2071 /* We don't need to recalculate live intervals inside the loop despite
2072 * flagging live_intervals_valid because we only use live intervals for
2073 * the interferes test, and we must have had a situation where the
2074 * intervals were:
2075 *
2076 * from    to
2077 *  ^
2078 *  |
2079 *  v
2080 *          ^
2081 *          |
2082 *          v
2083 *
2084 * Some register R that might get coalesced with one of these two could
2085 * only be referencing "to", otherwise "from"'s range would have been
2086 * longer. R's range could also only start at the end of "to" or later,
2087 * otherwise it will conflict with "to" when we try to coalesce "to"
2088 * into R anyway.
2089 */
2090 live_intervals_valid = false;
2091
2092 progress = true;
2093 continue;
2094 }
2095
2096 return progress;
2097 }
2098
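/**
 * Coalesces the destination of a plain MOV back into its source by
 * rewriting later reads of the destination to read the source instead,
 * as long as neither register is overwritten before the end of the
 * program.
 *
 * Purely illustrative example (hypothetical IR):
 *
 *    1. mov vgrf6, vgrf2
 *    2. mul vgrf7, vgrf6, vgrf3
 *
 * becomes
 *
 *    2. mul vgrf7, vgrf2, vgrf3
 */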
2099 bool
2100 fs_visitor::register_coalesce()
2101 {
2102 bool progress = false;
2103 int if_depth = 0;
2104 int loop_depth = 0;
2105
2106 foreach_list_safe(node, &this->instructions) {
2107 fs_inst *inst = (fs_inst *)node;
2108
2109 /* Make sure that we dominate the instructions we're going to
2110 * scan for interfering with our coalescing, or we won't have
2111 * scanned enough to see if anything interferes with our
2112 * coalescing. We don't dominate the following instructions if
2113 * we're in a loop or an if block.
2114 */
2115 switch (inst->opcode) {
2116 case BRW_OPCODE_DO:
2117 loop_depth++;
2118 break;
2119 case BRW_OPCODE_WHILE:
2120 loop_depth--;
2121 break;
2122 case BRW_OPCODE_IF:
2123 if_depth++;
2124 break;
2125 case BRW_OPCODE_ENDIF:
2126 if_depth--;
2127 break;
2128 default:
2129 break;
2130 }
2131 if (loop_depth || if_depth)
2132 continue;
2133
2134 if (inst->opcode != BRW_OPCODE_MOV ||
2135 inst->is_partial_write() ||
2136 inst->saturate ||
2137 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2138 inst->src[0].file != UNIFORM) ||
2139 inst->dst.type != inst->src[0].type)
2140 continue;
2141
2142 bool has_source_modifiers = (inst->src[0].abs ||
2143 inst->src[0].negate ||
2144 inst->src[0].smear != -1 ||
2145 inst->src[0].file == UNIFORM);
2146
2147 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2148 * them: check for no writes to either one until the exit of the
2149 * program.
2150 */
2151 bool interfered = false;
2152
2153 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2154 !scan_inst->is_tail_sentinel();
2155 scan_inst = (fs_inst *)scan_inst->next) {
2156 if (scan_inst->dst.file == GRF) {
2157 if (scan_inst->overwrites_reg(inst->dst) ||
2158 scan_inst->overwrites_reg(inst->src[0])) {
2159 interfered = true;
2160 break;
2161 }
2162 }
2163
2164 if (has_source_modifiers) {
2165 for (int i = 0; i < 3; i++) {
2166 if (scan_inst->src[i].file == GRF &&
2167 scan_inst->src[i].reg == inst->dst.reg &&
2168 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2169 inst->dst.type != scan_inst->src[i].type)
2170 {
2171 interfered = true;
2172 break;
2173 }
2174 }
2175 }
2176
2177
2178 /* The gen6 MATH instruction can't handle source modifiers or
2179 * unusual register regions, so avoid coalescing those for
2180 * now. We should do something more specific.
2181 */
2182 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2183 interfered = true;
2184 break;
2185 }
2186
2187 /* The accumulator result appears to get used for the
2188 * conditional modifier generation. When negating a UD
2189 * value, there is a 33rd bit generated for the sign in the
2190 * accumulator value, so now you can't check, for example,
2191 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2192 */
2193 if (scan_inst->conditional_mod &&
2194 inst->src[0].negate &&
2195 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2196 interfered = true;
2197 break;
2198 }
2199 }
2200 if (interfered) {
2201 continue;
2202 }
2203
2204 /* Rewrite the later usage to point at the source of the move to
2205 * be removed.
2206 */
2207 for (fs_inst *scan_inst = inst;
2208 !scan_inst->is_tail_sentinel();
2209 scan_inst = (fs_inst *)scan_inst->next) {
2210 for (int i = 0; i < 3; i++) {
2211 if (scan_inst->src[i].file == GRF &&
2212 scan_inst->src[i].reg == inst->dst.reg &&
2213 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2214 fs_reg new_src = inst->src[0];
2215 if (scan_inst->src[i].abs) {
2216 new_src.negate = 0;
2217 new_src.abs = 1;
2218 }
2219 new_src.negate ^= scan_inst->src[i].negate;
2220 scan_inst->src[i] = new_src;
2221 }
2222 }
2223 }
2224
2225 inst->remove();
2226 progress = true;
2227 }
2228
2229 if (progress)
2230 live_intervals_valid = false;
2231
2232 return progress;
2233 }
2234
2235
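/**
 * Looks for a MOV of a GRF into an MRF and tries to rewrite the
 * instruction that produced the GRF value to write straight into the MRF
 * instead, eliminating the MOV.
 *
 * Purely illustrative example (hypothetical IR):
 *
 *    1. mul vgrf5, vgrf1, vgrf2
 *    2. mov m4, vgrf5
 *
 * becomes
 *
 *    1. mul m4, vgrf1, vgrf2
 */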
2236 bool
2237 fs_visitor::compute_to_mrf()
2238 {
2239 bool progress = false;
2240 int next_ip = 0;
2241
2242 calculate_live_intervals();
2243
2244 foreach_list_safe(node, &this->instructions) {
2245 fs_inst *inst = (fs_inst *)node;
2246
2247 int ip = next_ip;
2248 next_ip++;
2249
2250 if (inst->opcode != BRW_OPCODE_MOV ||
2251 inst->is_partial_write() ||
2252 inst->dst.file != MRF || inst->src[0].file != GRF ||
2253 inst->dst.type != inst->src[0].type ||
2254 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2255 continue;
2256
2257 /* Work out which hardware MRF registers are written by this
2258 * instruction.
2259 */
2260 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2261 int mrf_high;
2262 if (inst->dst.reg & BRW_MRF_COMPR4) {
2263 mrf_high = mrf_low + 4;
2264 } else if (dispatch_width == 16 &&
2265 (!inst->force_uncompressed && !inst->force_sechalf)) {
2266 mrf_high = mrf_low + 1;
2267 } else {
2268 mrf_high = mrf_low;
2269 }
2270
2271 /* Can't compute-to-MRF this GRF if someone else was going to
2272 * read it later.
2273 */
2274 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2275 continue;
2276
2277 /* Found a move of a GRF to a MRF. Let's see if we can rewrite the
2278 * instruction that produced this GRF value to write into the MRF directly.
2279 */
2280 fs_inst *scan_inst;
2281 for (scan_inst = (fs_inst *)inst->prev;
2282 scan_inst->prev != NULL;
2283 scan_inst = (fs_inst *)scan_inst->prev) {
2284 if (scan_inst->dst.file == GRF &&
2285 scan_inst->dst.reg == inst->src[0].reg) {
2286 /* Found the last instruction to write the register we want to
2287 * turn into a compute-to-MRF.
2288 */
2289
2290 /* If this one instruction didn't populate all the
2291 * channels, bail. We might be able to rewrite everything
2292 * that writes that reg, but it would require smarter
2293 * tracking to delay the rewriting until complete success.
2294 */
2295 if (scan_inst->is_partial_write())
2296 break;
2297
2298 /* Things returning more than one register would need us to
2299 * understand coalescing out more than one MOV at a time.
2300 */
2301 if (scan_inst->regs_written > 1)
2302 break;
2303
2304 /* SEND instructions can't have MRF as a destination. */
2305 if (scan_inst->mlen)
2306 break;
2307
2308 if (brw->gen == 6) {
2309 /* gen6 math instructions must have the destination be
2310 * GRF, so no compute-to-MRF for them.
2311 */
2312 if (scan_inst->is_math()) {
2313 break;
2314 }
2315 }
2316
2317 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2318 /* Found the creator of our MRF's source value. */
2319 scan_inst->dst.file = MRF;
2320 scan_inst->dst.reg = inst->dst.reg;
2321 scan_inst->saturate |= inst->saturate;
2322 inst->remove();
2323 progress = true;
2324 }
2325 break;
2326 }
2327
2328 /* We don't handle control flow here. Most computation of
2329 * values that end up in MRFs happens shortly before the MRF
2330 * write anyway.
2331 */
2332 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2333 break;
2334
2335 /* You can't read from an MRF, so if someone else reads our
2336 * MRF's source GRF that we wanted to rewrite, that stops us.
2337 */
2338 bool interfered = false;
2339 for (int i = 0; i < 3; i++) {
2340 if (scan_inst->src[i].file == GRF &&
2341 scan_inst->src[i].reg == inst->src[0].reg &&
2342 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2343 interfered = true;
2344 }
2345 }
2346 if (interfered)
2347 break;
2348
2349 if (scan_inst->dst.file == MRF) {
2350 /* If somebody else writes our MRF here, we can't
2351 * compute-to-MRF before that.
2352 */
2353 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2354 int scan_mrf_high;
2355
2356 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2357 scan_mrf_high = scan_mrf_low + 4;
2358 } else if (dispatch_width == 16 &&
2359 (!scan_inst->force_uncompressed &&
2360 !scan_inst->force_sechalf)) {
2361 scan_mrf_high = scan_mrf_low + 1;
2362 } else {
2363 scan_mrf_high = scan_mrf_low;
2364 }
2365
2366 if (mrf_low == scan_mrf_low ||
2367 mrf_low == scan_mrf_high ||
2368 mrf_high == scan_mrf_low ||
2369 mrf_high == scan_mrf_high) {
2370 break;
2371 }
2372 }
2373
2374 if (scan_inst->mlen > 0) {
2375 /* Found a SEND instruction, which means that there are
2376 * live values in MRFs from base_mrf to base_mrf +
2377 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2378 * above it.
2379 */
2380 if (mrf_low >= scan_inst->base_mrf &&
2381 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2382 break;
2383 }
2384 if (mrf_high >= scan_inst->base_mrf &&
2385 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2386 break;
2387 }
2388 }
2389 }
2390 }
2391
2392 if (progress)
2393 live_intervals_valid = false;
2394
2395 return progress;
2396 }
2397
2398 /**
2399 * Walks through basic blocks, looking for repeated MRF writes and
2400 * removing the later ones.
2401 */
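/* Purely illustrative example (hypothetical IR): if a basic block contains
 *
 *    1. mov m3, vgrf5
 *    2. mov m4, vgrf6
 *    3. mov m3, vgrf5
 *
 * and nothing rewrites vgrf5 or m3 in between, instruction 3 compares
 * equal to the recorded last write to m3 and is removed.
 */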
2402 bool
2403 fs_visitor::remove_duplicate_mrf_writes()
2404 {
2405 fs_inst *last_mrf_move[16];
2406 bool progress = false;
2407
2408 /* We would need to update the MRF tracking to handle compressed instructions, so skip 16-wide dispatch. */
2409 if (dispatch_width == 16)
2410 return false;
2411
2412 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2413
2414 foreach_list_safe(node, &this->instructions) {
2415 fs_inst *inst = (fs_inst *)node;
2416
2417 if (inst->is_control_flow()) {
2418 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2419 }
2420
2421 if (inst->opcode == BRW_OPCODE_MOV &&
2422 inst->dst.file == MRF) {
2423 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2424 if (prev_inst && inst->equals(prev_inst)) {
2425 inst->remove();
2426 progress = true;
2427 continue;
2428 }
2429 }
2430
2431 /* Clear out the last-write records for MRFs that were overwritten. */
2432 if (inst->dst.file == MRF) {
2433 last_mrf_move[inst->dst.reg] = NULL;
2434 }
2435
2436 if (inst->mlen > 0) {
2437 /* Found a SEND instruction, which will include two or fewer
2438 * implied MRF writes. We could do better here.
2439 */
2440 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2441 last_mrf_move[inst->base_mrf + i] = NULL;
2442 }
2443 }
2444
2445 /* Clear out any MRF move records whose sources got overwritten. */
2446 if (inst->dst.file == GRF) {
2447 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2448 if (last_mrf_move[i] &&
2449 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2450 last_mrf_move[i] = NULL;
2451 }
2452 }
2453 }
2454
2455 if (inst->opcode == BRW_OPCODE_MOV &&
2456 inst->dst.file == MRF &&
2457 inst->src[0].file == GRF &&
2458 !inst->is_partial_write()) {
2459 last_mrf_move[inst->dst.reg] = inst;
2460 }
2461 }
2462
2463 if (progress)
2464 live_intervals_valid = false;
2465
2466 return progress;
2467 }
2468
2469 static void
2470 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2471 int first_grf, int grf_len)
2472 {
2473 bool inst_16wide = (dispatch_width > 8 &&
2474 !inst->force_uncompressed &&
2475 !inst->force_sechalf);
2476
2477 /* Clear the flag for registers that actually got read (as expected). */
2478 for (int i = 0; i < 3; i++) {
2479 int grf;
2480 if (inst->src[i].file == GRF) {
2481 grf = inst->src[i].reg;
2482 } else if (inst->src[i].file == HW_REG &&
2483 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2484 grf = inst->src[i].fixed_hw_reg.nr;
2485 } else {
2486 continue;
2487 }
2488
2489 if (grf >= first_grf &&
2490 grf < first_grf + grf_len) {
2491 deps[grf - first_grf] = false;
2492 if (inst_16wide)
2493 deps[grf - first_grf + 1] = false;
2494 }
2495 }
2496 }
2497
2498 /**
2499 * Implements this workaround for the original 965:
2500 *
2501 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2502 * check for post destination dependencies on this instruction, software
2503 * must ensure that there is no destination hazard for the case of ‘write
2504 * followed by a posted write’ shown in the following example.
2505 *
2506 * 1. mov r3 0
2507 * 2. send r3.xy <rest of send instruction>
2508 * 3. mov r2 r3
2509 *
2510 * Due to no post-destination dependency check on the ‘send’, the above
2511 * code sequence could have two instructions (1 and 2) in flight at the
2512 * same time that both consider ‘r3’ as the target of their final writes.
2513 */
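/* A rough sketch of the fix (illustrative only, reusing the example
 * above): before the send, the pass inserts a DEP_RESOLVE_MOV touching
 * r3, forcing the outstanding write from instruction 1 to be resolved
 * before the send's posted write to r3 can occur:
 *
 *    1. mov r3 0
 *    1a. DEP_RESOLVE_MOV(r3)
 *    2. send r3.xy <rest of send instruction>
 *    3. mov r2 r3
 */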
2514 void
2515 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2516 {
2517 int reg_size = dispatch_width / 8;
2518 int write_len = inst->regs_written * reg_size;
2519 int first_write_grf = inst->dst.reg;
2520 bool needs_dep[BRW_MAX_MRF];
2521 assert(write_len < (int)sizeof(needs_dep) - 1);
2522
2523 memset(needs_dep, false, sizeof(needs_dep));
2524 memset(needs_dep, true, write_len);
2525
2526 clear_deps_for_inst_src(inst, dispatch_width,
2527 needs_dep, first_write_grf, write_len);
2528
2529 /* Walk backwards looking for writes to registers we're writing which
2530 * aren't read since being written. If we hit the start of the program,
2531 * we assume that there are no outstanding dependencies on entry to the
2532 * program.
2533 */
2534 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2535 scan_inst != NULL;
2536 scan_inst = (fs_inst *)scan_inst->prev) {
2537
2538 /* If we hit control flow, assume that there *are* outstanding
2539 * dependencies, and force their cleanup before our instruction.
2540 */
2541 if (scan_inst->is_control_flow()) {
2542 for (int i = 0; i < write_len; i++) {
2543 if (needs_dep[i]) {
2544 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2545 }
2546 }
2547 return;
2548 }
2549
2550 bool scan_inst_16wide = (dispatch_width > 8 &&
2551 !scan_inst->force_uncompressed &&
2552 !scan_inst->force_sechalf);
2553
2554 /* We insert our reads as late as possible on the assumption that any
2555 * instruction but a MOV that might have left us an outstanding
2556 * dependency has more latency than a MOV.
2557 */
2558 if (scan_inst->dst.file == GRF) {
2559 for (int i = 0; i < scan_inst->regs_written; i++) {
2560 int reg = scan_inst->dst.reg + i * reg_size;
2561
2562 if (reg >= first_write_grf &&
2563 reg < first_write_grf + write_len &&
2564 needs_dep[reg - first_write_grf]) {
2565 inst->insert_before(DEP_RESOLVE_MOV(reg));
2566 needs_dep[reg - first_write_grf] = false;
2567 if (scan_inst_16wide)
2568 needs_dep[reg - first_write_grf + 1] = false;
2569 }
2570 }
2571 }
2572
2573 /* Clear the flag for registers that actually got read (as expected). */
2574 clear_deps_for_inst_src(scan_inst, dispatch_width,
2575 needs_dep, first_write_grf, write_len);
2576
2577 /* Continue the loop only if we haven't resolved all the dependencies */
2578 int i;
2579 for (i = 0; i < write_len; i++) {
2580 if (needs_dep[i])
2581 break;
2582 }
2583 if (i == write_len)
2584 return;
2585 }
2586 }
2587
2588 /**
2589 * Implements this workaround for the original 965:
2590 *
2591 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2592 * used as a destination register until after it has been sourced by an
2593 * instruction with a different destination register."
2594 */
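/* A rough sketch (illustrative only): if a send writes r4 and a later
 * instruction wants to overwrite r4 before anything has read it, the pass
 * inserts a DEP_RESOLVE_MOV that sources r4 first:
 *
 *    1. send r4 <rest of send instruction>
 *    1a. DEP_RESOLVE_MOV(r4)
 *    2. mov r4 r5
 */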
2595 void
2596 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2597 {
2598 int write_len = inst->regs_written * dispatch_width / 8;
2599 int first_write_grf = inst->dst.reg;
2600 bool needs_dep[BRW_MAX_MRF];
2601 assert(write_len < (int)sizeof(needs_dep) - 1);
2602
2603 memset(needs_dep, false, sizeof(needs_dep));
2604 memset(needs_dep, true, write_len);
2605 /* Walk forwards looking for writes to registers we're writing which aren't
2606 * read before being written.
2607 */
2608 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2609 !scan_inst->is_tail_sentinel();
2610 scan_inst = (fs_inst *)scan_inst->next) {
2611 /* If we hit control flow, force resolve all remaining dependencies. */
2612 if (scan_inst->is_control_flow()) {
2613 for (int i = 0; i < write_len; i++) {
2614 if (needs_dep[i])
2615 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2616 }
2617 return;
2618 }
2619
2620 /* Clear the flag for registers that actually got read (as expected). */
2621 clear_deps_for_inst_src(scan_inst, dispatch_width,
2622 needs_dep, first_write_grf, write_len);
2623
2624 /* We insert our reads as late as possible since they're reading the
2625 * result of a SEND, which has massive latency.
2626 */
2627 if (scan_inst->dst.file == GRF &&
2628 scan_inst->dst.reg >= first_write_grf &&
2629 scan_inst->dst.reg < first_write_grf + write_len &&
2630 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2631 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2632 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2633 }
2634
2635 /* Continue the loop only if we haven't resolved all the dependencies */
2636 int i;
2637 for (i = 0; i < write_len; i++) {
2638 if (needs_dep[i])
2639 break;
2640 }
2641 if (i == write_len)
2642 return;
2643 }
2644
2645 /* If we hit the end of the program, resolve all remaining dependencies out
2646 * of paranoia.
2647 */
2648 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2649 assert(last_inst->eot);
2650 for (int i = 0; i < write_len; i++) {
2651 if (needs_dep[i])
2652 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2653 }
2654 }
2655
2656 void
2657 fs_visitor::insert_gen4_send_dependency_workarounds()
2658 {
2659 if (brw->gen != 4 || brw->is_g4x)
2660 return;
2661
2662 /* Note that we're done with register allocation, so GRF fs_regs always
2663 * have a .reg_offset of 0.
2664 */
2665
2666 foreach_list_safe(node, &this->instructions) {
2667 fs_inst *inst = (fs_inst *)node;
2668
2669 if (inst->mlen != 0 && inst->dst.file == GRF) {
2670 insert_gen4_pre_send_dependency_workarounds(inst);
2671 insert_gen4_post_send_dependency_workarounds(inst);
2672 }
2673 }
2674 }
2675
2676 /**
2677 * Turns the generic expression-style uniform pull constant load instruction
2678 * into a hardware-specific series of instructions for loading a pull
2679 * constant.
2680 *
2681 * The expression style allows the CSE pass before this to optimize out
2682 * repeated loads from the same offset, and gives the pre-register-allocation
2683 * scheduling full flexibility, while the conversion to native instructions
2684 * allows the post-register-allocation scheduler the best information
2685 * possible.
2686 *
2687 * Note that execution masking for setting up pull constant loads is special:
2688 * the channels that need to be written are unrelated to the current execution
2689 * mask, since a later instruction will use one of the result channels as a
2690 * source operand for all 8 or 16 of its channels.
2691 */
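/* Purely illustrative example of the gen7 path below (hypothetical IR and
 * operand values): a load of the uniform at vec4-aligned byte offset 16,
 *
 *    uniform_pull_const_load vgrf6, surf_index, 16u
 *
 * is rewritten as
 *
 *    set_simd4x2_offset vgrf7, 4u
 *    uniform_pull_const_load_gen7 vgrf6, surf_index, vgrf7
 *
 * where 4u is the dword offset (16 bytes / 4) and the SET_SIMD4X2_OFFSET
 * runs with force_writemask_all so its result is a def for every channel.
 */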
2692 void
2693 fs_visitor::lower_uniform_pull_constant_loads()
2694 {
2695 foreach_list(node, &this->instructions) {
2696 fs_inst *inst = (fs_inst *)node;
2697
2698 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2699 continue;
2700
2701 if (brw->gen >= 7) {
2702 /* The offset arg before was a vec4-aligned byte offset. We need to
2703 * turn it into a dword offset.
2704 */
2705 fs_reg const_offset_reg = inst->src[1];
2706 assert(const_offset_reg.file == IMM &&
2707 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2708 const_offset_reg.imm.u /= 4;
2709 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2710
2711 /* This is actually going to be a MOV, but since only the first dword
2712 * is accessed, we have a special opcode to do just that one. Note
2713 * that this needs to be an operation that will be considered a def
2714 * by live variable analysis, or register allocation will explode.
2715 */
2716 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2717 payload, const_offset_reg);
2718 setup->force_writemask_all = true;
2719
2720 setup->ir = inst->ir;
2721 setup->annotation = inst->annotation;
2722 inst->insert_before(setup);
2723
2724 /* Similarly, this will only populate the first 4 channels of the
2725 * result register (since we only use smear values from 0-3), but we
2726 * don't tell the optimizer.
2727 */
2728 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2729 inst->src[1] = payload;
2730
2731 this->live_intervals_valid = false;
2732 } else {
2733 /* Before register allocation, we didn't tell the scheduler about the
2734 * MRF we use. We know it's safe to use this MRF because nothing
2735 * else does except for register spill/unspill, which generates and
2736 * uses its MRF within a single IR instruction.
2737 */
2738 inst->base_mrf = 14;
2739 inst->mlen = 1;
2740 }
2741 }
2742 }
2743
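/**
 * Prints one IR instruction in human-readable form for debugging.
 *
 * A line of output looks roughly like (illustrative, following the printf
 * calls below):
 *
 *    (+f0.1) add.sat vgrf4, vgrf2, 2.0f, (null),  2ndhalf
 */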
2744 void
2745 fs_visitor::dump_instruction(backend_instruction *be_inst)
2746 {
2747 fs_inst *inst = (fs_inst *)be_inst;
2748
2749 if (inst->predicate) {
2750 printf("(%cf0.%d) ",
2751 inst->predicate_inverse ? '-' : '+',
2752 inst->flag_subreg);
2753 }
2754
2755 printf("%s", brw_instruction_name(inst->opcode));
2756 if (inst->saturate)
2757 printf(".sat");
2758 if (inst->conditional_mod) {
2759 printf(".cmod");
2760 if (!inst->predicate &&
2761 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2762 inst->opcode != BRW_OPCODE_IF &&
2763 inst->opcode != BRW_OPCODE_WHILE))) {
2764 printf(".f0.%d", inst->flag_subreg);
2765 }
2766 }
2767 printf(" ");
2768
2769
2770 switch (inst->dst.file) {
2771 case GRF:
2772 printf("vgrf%d", inst->dst.reg);
2773 if (inst->dst.reg_offset)
2774 printf("+%d", inst->dst.reg_offset);
2775 break;
2776 case MRF:
2777 printf("m%d", inst->dst.reg);
2778 break;
2779 case BAD_FILE:
2780 printf("(null)");
2781 break;
2782 case UNIFORM:
2783 printf("***u%d***", inst->dst.reg);
2784 break;
2785 case ARF:
2786 if (inst->dst.reg == BRW_ARF_NULL)
2787 printf("(null)");
2788 else
2789 printf("arf%d", inst->dst.reg);
2790 break;
2791 default:
2792 printf("???");
2793 break;
2794 }
2795 printf(", ");
2796
2797 for (int i = 0; i < 3; i++) {
2798 if (inst->src[i].negate)
2799 printf("-");
2800 if (inst->src[i].abs)
2801 printf("|");
2802 switch (inst->src[i].file) {
2803 case GRF:
2804 printf("vgrf%d", inst->src[i].reg);
2805 if (inst->src[i].reg_offset)
2806 printf("+%d", inst->src[i].reg_offset);
2807 break;
2808 case MRF:
2809 printf("***m%d***", inst->src[i].reg);
2810 break;
2811 case UNIFORM:
2812 printf("u%d", inst->src[i].reg);
2813 if (inst->src[i].reg_offset)
2814 printf(".%d", inst->src[i].reg_offset);
2815 break;
2816 case BAD_FILE:
2817 printf("(null)");
2818 break;
2819 case IMM:
2820 switch (inst->src[i].type) {
2821 case BRW_REGISTER_TYPE_F:
2822 printf("%ff", inst->src[i].imm.f);
2823 break;
2824 case BRW_REGISTER_TYPE_D:
2825 printf("%dd", inst->src[i].imm.i);
2826 break;
2827 case BRW_REGISTER_TYPE_UD:
2828 printf("%uu", inst->src[i].imm.u);
2829 break;
2830 default:
2831 printf("???");
2832 break;
2833 }
2834 break;
2835 default:
2836 printf("???");
2837 break;
2838 }
2839 if (inst->src[i].abs)
2840 printf("|");
2841
2842 if (i < 3)
2843 printf(", ");
2844 }
2845
2846 printf(" ");
2847
2848 if (inst->force_uncompressed)
2849 printf("1sthalf ");
2850
2851 if (inst->force_sechalf)
2852 printf("2ndhalf ");
2853
2854 printf("\n");
2855 }
2856
2857 /**
2858 * Possibly returns an instruction that set up @param reg.
2859 *
2860 * Sometimes we want to take the result of some expression/variable
2861 * dereference tree and rewrite the instruction generating the result
2862 * of the tree. When processing the tree, we know that the
2863 * instructions generated are all writing temporaries that are dead
2864 * outside of this tree. So, if we have some instructions that write
2865 * a temporary, we're free to point that temp write somewhere else.
2866 *
2867 * Note that this doesn't guarantee that the returned instruction generated
2868 * only reg -- it might be the size=4 destination of a texture instruction.
2869 */
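/* Illustrative use (hypothetical caller code): after emitting the tree for
 * an expression whose result landed in vgrf12, a caller can do roughly
 *
 *    fs_inst *gen = get_instruction_generating_reg(start, end, vgrf12);
 *    if (gen)
 *       gen->dst = the_reg_we_really_wanted;   // hypothetical destination
 *
 * and skip the extra MOV; when NULL is returned the caller has to emit a
 * copy instead.
 */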
2870 fs_inst *
2871 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2872 fs_inst *end,
2873 fs_reg reg)
2874 {
2875 if (end == start ||
2876 end->is_partial_write() ||
2877 reg.reladdr ||
2878 !reg.equals(end->dst)) {
2879 return NULL;
2880 } else {
2881 return end;
2882 }
2883 }
2884
2885 void
2886 fs_visitor::setup_payload_gen6()
2887 {
2888 bool uses_depth =
2889 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2890 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2891
2892 assert(brw->gen >= 6);
2893
2894 /* R0-1: masks, pixel X/Y coordinates. */
2895 c->nr_payload_regs = 2;
2896 /* R2: only for 32-pixel dispatch. */
2897
2898 /* R3-26: barycentric interpolation coordinates. These appear in the
2899 * same order that they appear in the brw_wm_barycentric_interp_mode
2900 * enum. Each set of coordinates occupies 2 registers if dispatch width
2901 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2902 * appear if they were enabled using the "Barycentric Interpolation
2903 * Mode" bits in WM_STATE.
2904 */
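/* For example (an assumed common case, not derived from any particular
 * shader): an 8-wide program that only enables perspective pixel
 * barycentrics gets a single set of coordinates here, so
 * barycentric_coord_reg for that mode becomes 2 and nr_payload_regs
 * advances from 2 to 4 before the depth/W registers below are considered.
 */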
2905 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2906 if (barycentric_interp_modes & (1 << i)) {
2907 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2908 c->nr_payload_regs += 2;
2909 if (dispatch_width == 16) {
2910 c->nr_payload_regs += 2;
2911 }
2912 }
2913 }
2914
2915 /* R27: interpolated depth if uses source depth */
2916 if (uses_depth) {
2917 c->source_depth_reg = c->nr_payload_regs;
2918 c->nr_payload_regs++;
2919 if (dispatch_width == 16) {
2920 /* R28: interpolated depth if not 8-wide. */
2921 c->nr_payload_regs++;
2922 }
2923 }
2924 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2925 if (uses_depth) {
2926 c->source_w_reg = c->nr_payload_regs;
2927 c->nr_payload_regs++;
2928 if (dispatch_width == 16) {
2929 /* R30: interpolated W if not 8-wide. */
2930 c->nr_payload_regs++;
2931 }
2932 }
2933 /* R31: MSAA position offsets. */
2934 /* R32-: bary for 32-pixel. */
2935 /* R58-59: interp W for 32-pixel. */
2936
2937 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2938 c->source_depth_to_render_target = true;
2939 }
2940 }
2941
2942 bool
2943 fs_visitor::run()
2944 {
2945 sanity_param_count = fp->Base.Parameters->NumParameters;
2946 uint32_t orig_nr_params = c->prog_data.nr_params;
2947
2948 if (brw->gen >= 6)
2949 setup_payload_gen6();
2950 else
2951 setup_payload_gen4();
2952
2953 if (0) {
2954 emit_dummy_fs();
2955 } else {
2956 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2957 emit_shader_time_begin();
2958
2959 calculate_urb_setup();
2960 if (brw->gen < 6)
2961 emit_interpolation_setup_gen4();
2962 else
2963 emit_interpolation_setup_gen6();
2964
2965 /* We handle discards by keeping track of the still-live pixels in f0.1.
2966 * Initialize it with the dispatched pixels.
2967 */
2968 if (fp->UsesKill) {
2969 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2970 discard_init->flag_subreg = 1;
2971 }
2972
2973 /* Generate FS IR for main(). (the visitor only descends into
2974 * functions called "main").
2975 */
2976 if (shader) {
2977 foreach_list(node, &*shader->ir) {
2978 ir_instruction *ir = (ir_instruction *)node;
2979 base_ir = ir;
2980 this->result = reg_undef;
2981 ir->accept(this);
2982 }
2983 } else {
2984 emit_fragment_program_code();
2985 }
2986 base_ir = NULL;
2987 if (failed)
2988 return false;
2989
2990 emit(FS_OPCODE_PLACEHOLDER_HALT);
2991
2992 emit_fb_writes();
2993
2994 split_virtual_grfs();
2995
2996 move_uniform_array_access_to_pull_constants();
2997 setup_pull_constants();
2998
2999 bool progress;
3000 do {
3001 progress = false;
3002
3003 compact_virtual_grfs();
3004
3005 progress = remove_duplicate_mrf_writes() || progress;
3006
3007 progress = opt_algebraic() || progress;
3008 progress = opt_cse() || progress;
3009 progress = opt_copy_propagate() || progress;
3010 progress = dead_code_eliminate() || progress;
3011 progress = dead_code_eliminate_local() || progress;
3012 progress = register_coalesce() || progress;
3013 progress = register_coalesce_2() || progress;
3014 progress = compute_to_mrf() || progress;
3015 } while (progress);
3016
3017 remove_dead_constants();
3018
3019 schedule_instructions(false);
3020
3021 lower_uniform_pull_constant_loads();
3022
3023 assign_curb_setup();
3024 assign_urb_setup();
3025
3026 if (0) {
3027 /* Debug of register spilling: Go spill everything. */
3028 for (int i = 0; i < virtual_grf_count; i++) {
3029 spill_reg(i);
3030 }
3031 }
3032
3033 if (0)
3034 assign_regs_trivial();
3035 else {
3036 while (!assign_regs()) {
3037 if (failed)
3038 break;
3039 }
3040 }
3041 }
3042 assert(force_uncompressed_stack == 0);
3043 assert(force_sechalf_stack == 0);
3044
3045 /* This must come after all optimization and register allocation, since
3046 * it inserts dead code that happens to have side effects, and it does
3047 * so based on the actual physical registers in use.
3048 */
3049 insert_gen4_send_dependency_workarounds();
3050
3051 if (failed)
3052 return false;
3053
3054 schedule_instructions(true);
3055
3056 if (dispatch_width == 8) {
3057 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3058 } else {
3059 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3060
3061 /* Make sure we didn't try to sneak in an extra uniform */
3062 assert(orig_nr_params == c->prog_data.nr_params);
3063 (void) orig_nr_params;
3064 }
3065
3066 /* If any state parameters were appended, then ParameterValues could have
3067 * been realloced, in which case the driver uniform storage set up by
3068 * _mesa_associate_uniform_storage() would point to freed memory. Make
3069 * sure that didn't happen.
3070 */
3071 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3072
3073 return !failed;
3074 }
3075
3076 const unsigned *
3077 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3078 struct gl_fragment_program *fp,
3079 struct gl_shader_program *prog,
3080 unsigned *final_assembly_size)
3081 {
3082 bool start_busy = false;
3083 float start_time = 0;
3084
3085 if (unlikely(brw->perf_debug)) {
3086 start_busy = (brw->batch.last_bo &&
3087 drm_intel_bo_busy(brw->batch.last_bo));
3088 start_time = get_time();
3089 }
3090
3091 struct brw_shader *shader = NULL;
3092 if (prog)
3093 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3094
3095 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3096 if (prog) {
3097 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3098 _mesa_print_ir(shader->ir, NULL);
3099 printf("\n\n");
3100 } else {
3101 printf("ARB_fragment_program %d ir for native fragment shader\n",
3102 fp->Base.Id);
3103 _mesa_print_program(&fp->Base);
3104 }
3105 }
3106
3107 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3108 */
3109 fs_visitor v(brw, c, prog, fp, 8);
3110 if (!v.run()) {
3111 if (prog) {
3112 prog->LinkStatus = false;
3113 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3114 }
3115
3116 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3117 v.fail_msg);
3118
3119 return NULL;
3120 }
3121
3122 exec_list *simd16_instructions = NULL;
3123 fs_visitor v2(brw, c, prog, fp, 16);
3124 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3125 if (c->prog_data.nr_pull_params == 0) {
3126 /* Try a 16-wide compile */
3127 v2.import_uniforms(&v);
3128 if (!v2.run()) {
3129 perf_debug("16-wide shader failed to compile, falling back to "
3130 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3131 } else {
3132 simd16_instructions = &v2.instructions;
3133 }
3134 } else {
3135 perf_debug("Skipping 16-wide due to pull parameters.\n");
3136 }
3137 }
3138
3139 c->prog_data.dispatch_width = 8;
3140
3141 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3142 const unsigned *generated = g.generate_assembly(&v.instructions,
3143 simd16_instructions,
3144 final_assembly_size);
3145
3146 if (unlikely(brw->perf_debug) && shader) {
3147 if (shader->compiled_once)
3148 brw_wm_debug_recompile(brw, prog, &c->key);
3149 shader->compiled_once = true;
3150
3151 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3152 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3153 (get_time() - start_time) * 1000);
3154 }
3155 }
3156
3157 return generated;
3158 }
3159
3160 bool
3161 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3162 {
3163 struct brw_context *brw = brw_context(ctx);
3164 struct brw_wm_prog_key key;
3165
3166 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3167 return true;
3168
3169 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3170 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3171 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3172 bool program_uses_dfdy = fp->UsesDFdy;
3173
3174 memset(&key, 0, sizeof(key));
3175
3176 if (brw->gen < 6) {
3177 if (fp->UsesKill)
3178 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3179
3180 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3181 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3182
3183 /* Just assume depth testing. */
3184 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3185 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3186 }
3187
3188 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3189 BRW_FS_VARYING_INPUT_MASK) > 16)
3190 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3191
3192 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3193
3194 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3195 for (unsigned i = 0; i < sampler_count; i++) {
3196 if (fp->Base.ShadowSamplers & (1 << i)) {
3197 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3198 key.tex.swizzles[i] =
3199 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3200 } else {
3201 /* Color sampler: assume no swizzling. */
3202 key.tex.swizzles[i] = SWIZZLE_XYZW;
3203 }
3204 }
3205
3206 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3207 key.drawable_height = ctx->DrawBuffer->Height;
3208 }
3209
3210 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3211 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3212 }
3213
3214 key.nr_color_regions = 1;
3215
3216 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3217 * quality of the derivatives is likely to be determined by the driconf
3218 * option.
3219 */
3220 key.high_quality_derivatives = brw->disable_derivative_optimization;
3221
3222 key.program_string_id = bfp->id;
3223
3224 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3225 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3226
3227 bool success = do_wm_prog(brw, prog, bfp, &key);
3228
3229 brw->wm.base.prog_offset = old_prog_offset;
3230 brw->wm.prog_data = old_prog_data;
3231
3232 return success;
3233 }