i965: Remove the "ARF" register file.
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "main/uniforms.h"
50 #include "glsl/glsl_types.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
66 }
67
68 fs_inst::fs_inst()
69 {
70 init();
71 }
72
73 fs_inst::fs_inst(enum opcode opcode)
74 {
75 init();
76 this->opcode = opcode;
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
80 {
81 init();
82 this->opcode = opcode;
83 this->dst = dst;
84
85 if (dst.file == GRF)
86 assert(dst.reg_offset >= 0);
87 }
88
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
90 {
91 init();
92 this->opcode = opcode;
93 this->dst = dst;
94 this->src[0] = src0;
95
96 if (dst.file == GRF)
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
103 {
104 init();
105 this->opcode = opcode;
106 this->dst = dst;
107 this->src[0] = src0;
108 this->src[1] = src1;
109
110 if (dst.file == GRF)
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
116 }
117
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
120 {
121 init();
122 this->opcode = opcode;
123 this->dst = dst;
124 this->src[0] = src0;
125 this->src[1] = src1;
126 this->src[2] = src2;
127
128 if (dst.file == GRF)
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
136 }
137
138 #define ALU1(op) \
139 fs_inst * \
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
141 { \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
143 }
144
145 #define ALU2(op) \
146 fs_inst * \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
148 { \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
150 }
151
152 #define ALU3(op) \
153 fs_inst * \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
155 { \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
157 }
158
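/* Each invocation below expands to a small emitter method; for example,
 * ALU2(ADD) defines fs_visitor::ADD(dst, src0, src1), which simply
 * allocates a new BRW_OPCODE_ADD fs_inst out of mem_ctx.
 */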
159 ALU1(NOT)
160 ALU1(MOV)
161 ALU1(FRC)
162 ALU1(RNDD)
163 ALU1(RNDE)
164 ALU1(RNDZ)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2(ADDC)
184 ALU2(SUBB)
185
186 /** Gen4 predicated IF. */
187 fs_inst *
188 fs_visitor::IF(uint32_t predicate)
189 {
190 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
191 inst->predicate = predicate;
192 return inst;
193 }
194
195 /** Gen6+ IF with embedded comparison. */
196 fs_inst *
197 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
198 {
199 assert(brw->gen >= 6);
200 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
201 reg_null_d, src0, src1);
202 inst->conditional_mod = condition;
203 return inst;
204 }
205
206 /**
207 * CMP: Sets the low bit of the destination channels with the result
208 * of the comparison, while the upper bits are undefined, and updates
209 * the flag register with the packed 16 bits of the result.
210 */
211 fs_inst *
212 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
213 {
214 fs_inst *inst;
215
216 /* Take the instruction:
217 *
218 * CMP null<d> src0<f> src1<f>
219 *
220 * Original gen4 does type conversion to the destination type before
221 * comparison, producing garbage results for floating point comparisons.
222 * gen5 does the comparison on the execution type (resolved source types),
223 * so dst type doesn't matter. gen6 does comparison and then uses the
224 * result as if it was the dst type with no conversion, which happens to
225 * mostly work out for float-interpreted-as-int since our comparisons are
226 * for >0, =0, <0.
227 */
228 if (brw->gen == 4) {
229 dst.type = src0.type;
230 if (dst.file == HW_REG)
231 dst.fixed_hw_reg.type = dst.type;
232 }
233
234 resolve_ud_negate(&src0);
235 resolve_ud_negate(&src1);
236
237 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
238 inst->conditional_mod = condition;
239
240 return inst;
241 }
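/* A typical use is something like
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 * which discards the per-channel results and just leaves the flag
 * register set for later predication.
 */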
242
243 exec_list
244 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
245 fs_reg varying_offset,
246 uint32_t const_offset)
247 {
248 exec_list instructions;
249 fs_inst *inst;
250
251 /* We have our constant surface use a pitch of 4 bytes, so our index can
252 * be any component of a vector, and then we load 4 contiguous
253 * components starting from that.
254 *
255 * We break down the const_offset to a portion added to the variable
256 * offset and a portion done using reg_offset, which means that if you
257 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
258 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
259 * CSE can later notice that those loads are all the same and eliminate
260 * the redundant ones.
261 */
262 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
263 instructions.push_tail(ADD(vec4_offset,
264 varying_offset, const_offset & ~3));
265
266 int scale = 1;
267 if (brw->gen == 4 && dispatch_width == 8) {
268 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
269 * u, v, r) as parameters, or we can just use the SIMD16 message
270 * consisting of (header, u). We choose the second, at the cost of a
271 * longer return length.
272 */
273 scale = 2;
274 }
275
276 enum opcode op;
277 if (brw->gen >= 7)
278 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
279 else
280 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
281 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
282 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
283 inst->regs_written = 4 * scale;
284 instructions.push_tail(inst);
285
286 if (brw->gen < 7) {
287 inst->base_mrf = 13;
288 inst->header_present = true;
289 if (brw->gen == 4)
290 inst->mlen = 3;
291 else
292 inst->mlen = 1 + dispatch_width / 8;
293 }
294
295 vec4_result.reg_offset += (const_offset & 3) * scale;
296 instructions.push_tail(MOV(dst, vec4_result));
297
298 return instructions;
299 }
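/* For example, with const_offset == 6 the vec4 load above reads from
 * varying_offset + 4 (the aligned part), and the final MOV picks
 * component 2 of the returned vec4 (reg_offset += 2 * scale).
 */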
300
301 /**
302 * A helper for MOV generation for fixing up broken hardware SEND dependency
303 * handling.
304 */
305 fs_inst *
306 fs_visitor::DEP_RESOLVE_MOV(int grf)
307 {
308 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
309
310 inst->ir = NULL;
311 inst->annotation = "send dependency resolve";
312
313 /* The caller always wants uncompressed to emit the minimal extra
314 * dependencies, and to avoid having to deal with aligning its regs to 2.
315 */
316 inst->force_uncompressed = true;
317
318 return inst;
319 }
320
321 bool
322 fs_inst::equals(fs_inst *inst)
323 {
324 return (opcode == inst->opcode &&
325 dst.equals(inst->dst) &&
326 src[0].equals(inst->src[0]) &&
327 src[1].equals(inst->src[1]) &&
328 src[2].equals(inst->src[2]) &&
329 saturate == inst->saturate &&
330 predicate == inst->predicate &&
331 conditional_mod == inst->conditional_mod &&
332 mlen == inst->mlen &&
333 base_mrf == inst->base_mrf &&
334 sampler == inst->sampler &&
335 target == inst->target &&
336 eot == inst->eot &&
337 header_present == inst->header_present &&
338 shadow_compare == inst->shadow_compare &&
339 offset == inst->offset);
340 }
341
342 bool
343 fs_inst::overwrites_reg(const fs_reg &reg)
344 {
345 return (reg.file == dst.file &&
346 reg.reg == dst.reg &&
347 reg.reg_offset >= dst.reg_offset &&
348 reg.reg_offset < dst.reg_offset + regs_written);
349 }
350
351 bool
352 fs_inst::is_send_from_grf()
353 {
354 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
355 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
356 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
357 src[1].file == GRF));
358 }
359
360 bool
361 fs_visitor::can_do_source_mods(fs_inst *inst)
362 {
363 if (brw->gen == 6 && inst->is_math())
364 return false;
365
366 if (inst->is_send_from_grf())
367 return false;
368
369 return true;
370 }
371
372 void
373 fs_reg::init()
374 {
375 memset(this, 0, sizeof(*this));
376 this->smear = -1;
377 }
378
379 /** Generic unset register constructor. */
380 fs_reg::fs_reg()
381 {
382 init();
383 this->file = BAD_FILE;
384 }
385
386 /** Immediate value constructor. */
387 fs_reg::fs_reg(float f)
388 {
389 init();
390 this->file = IMM;
391 this->type = BRW_REGISTER_TYPE_F;
392 this->imm.f = f;
393 }
394
395 /** Immediate value constructor. */
396 fs_reg::fs_reg(int32_t i)
397 {
398 init();
399 this->file = IMM;
400 this->type = BRW_REGISTER_TYPE_D;
401 this->imm.i = i;
402 }
403
404 /** Immediate value constructor. */
405 fs_reg::fs_reg(uint32_t u)
406 {
407 init();
408 this->file = IMM;
409 this->type = BRW_REGISTER_TYPE_UD;
410 this->imm.u = u;
411 }
412
413 /** Fixed brw_reg Immediate value constructor. */
414 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
415 {
416 init();
417 this->file = HW_REG;
418 this->fixed_hw_reg = fixed_hw_reg;
419 this->type = fixed_hw_reg.type;
420 }
421
422 bool
423 fs_reg::equals(const fs_reg &r) const
424 {
425 return (file == r.file &&
426 reg == r.reg &&
427 reg_offset == r.reg_offset &&
428 type == r.type &&
429 negate == r.negate &&
430 abs == r.abs &&
431 !reladdr && !r.reladdr &&
432 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
433 sizeof(fixed_hw_reg)) == 0 &&
434 smear == r.smear &&
435 imm.u == r.imm.u);
436 }
437
438 bool
439 fs_reg::is_zero() const
440 {
441 if (file != IMM)
442 return false;
443
444 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
445 }
446
447 bool
448 fs_reg::is_one() const
449 {
450 if (file != IMM)
451 return false;
452
453 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
454 }
455
456 bool
457 fs_reg::is_valid_3src() const
458 {
459 return file == GRF || file == UNIFORM;
460 }
461
462 int
463 fs_visitor::type_size(const struct glsl_type *type)
464 {
465 unsigned int size, i;
466
467 switch (type->base_type) {
468 case GLSL_TYPE_UINT:
469 case GLSL_TYPE_INT:
470 case GLSL_TYPE_FLOAT:
471 case GLSL_TYPE_BOOL:
472 return type->components();
473 case GLSL_TYPE_ARRAY:
474 return type_size(type->fields.array) * type->length;
475 case GLSL_TYPE_STRUCT:
476 size = 0;
477 for (i = 0; i < type->length; i++) {
478 size += type_size(type->fields.structure[i].type);
479 }
480 return size;
481 case GLSL_TYPE_SAMPLER:
482 /* Samplers take up no register space, since they're baked in at
483 * link time.
484 */
485 return 0;
486 case GLSL_TYPE_VOID:
487 case GLSL_TYPE_ERROR:
488 case GLSL_TYPE_INTERFACE:
489 assert(!"not reached");
490 break;
491 }
492
493 return 0;
494 }
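/* e.g. type_size() of a float is 1, a vec4 is 4, a mat3 is 9, and a
 * vec4[20] array is 80 components.
 */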
495
496 fs_reg
497 fs_visitor::get_timestamp()
498 {
499 assert(brw->gen >= 7);
500
501 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
502 BRW_ARF_TIMESTAMP,
503 0),
504 BRW_REGISTER_TYPE_UD));
505
506 fs_reg dst = fs_reg(this, glsl_type::uint_type);
507
508 fs_inst *mov = emit(MOV(dst, ts));
509 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
510 * even if it's not enabled in the dispatch.
511 */
512 mov->force_writemask_all = true;
513 mov->force_uncompressed = true;
514
515 /* The caller wants the low 32 bits of the timestamp. Since it's running
516 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
517 * which is plenty of time for our purposes. It is identical across the
518 * EUs, but since it's tracking GPU core speed it will increment at a
519 * varying rate as render P-states change.
520 *
521 * The caller could also check if render P-states have changed (or anything
522 * else that might disrupt timing) by setting smear to 2 and checking if
523 * that field is != 0.
524 */
525 dst.smear = 0;
526
527 return dst;
528 }
529
530 void
531 fs_visitor::emit_shader_time_begin()
532 {
533 current_annotation = "shader time start";
534 shader_start_time = get_timestamp();
535 }
536
537 void
538 fs_visitor::emit_shader_time_end()
539 {
540 current_annotation = "shader time end";
541
542 enum shader_time_shader_type type, written_type, reset_type;
543 if (dispatch_width == 8) {
544 type = ST_FS8;
545 written_type = ST_FS8_WRITTEN;
546 reset_type = ST_FS8_RESET;
547 } else {
548 assert(dispatch_width == 16);
549 type = ST_FS16;
550 written_type = ST_FS16_WRITTEN;
551 reset_type = ST_FS16_RESET;
552 }
553
554 fs_reg shader_end_time = get_timestamp();
555
556 /* Check that there weren't any timestamp reset events (assuming these
557 * were the only two timestamp reads that happened).
558 */
559 fs_reg reset = shader_end_time;
560 reset.smear = 2;
561 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
562 test->conditional_mod = BRW_CONDITIONAL_Z;
563 emit(IF(BRW_PREDICATE_NORMAL));
564
565 push_force_uncompressed();
566 fs_reg start = shader_start_time;
567 start.negate = true;
568 fs_reg diff = fs_reg(this, glsl_type::uint_type);
569 emit(ADD(diff, start, shader_end_time));
570
571 /* If there were no instructions between the two timestamp gets, the diff
572 * is 2 cycles. Remove that overhead, so I can forget about that when
573 * trying to determine the time taken for single instructions.
574 */
575 emit(ADD(diff, diff, fs_reg(-2u)));
576
577 emit_shader_time_write(type, diff);
578 emit_shader_time_write(written_type, fs_reg(1u));
579 emit(BRW_OPCODE_ELSE);
580 emit_shader_time_write(reset_type, fs_reg(1u));
581 emit(BRW_OPCODE_ENDIF);
582
583 pop_force_uncompressed();
584 }
585
586 void
587 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
588 fs_reg value)
589 {
590 int shader_time_index =
591 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
592 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
593
594 fs_reg payload;
595 if (dispatch_width == 8)
596 payload = fs_reg(this, glsl_type::uvec2_type);
597 else
598 payload = fs_reg(this, glsl_type::uint_type);
599
600 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
601 fs_reg(), payload, offset, value));
602 }
603
604 void
605 fs_visitor::fail(const char *format, ...)
606 {
607 va_list va;
608 char *msg;
609
610 if (failed)
611 return;
612
613 failed = true;
614
615 va_start(va, format);
616 msg = ralloc_vasprintf(mem_ctx, format, va);
617 va_end(va);
618 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
619
620 this->fail_msg = msg;
621
622 if (INTEL_DEBUG & DEBUG_WM) {
623 fprintf(stderr, "%s", msg);
624 }
625 }
626
627 fs_inst *
628 fs_visitor::emit(enum opcode opcode)
629 {
630 return emit(fs_inst(opcode));
631 }
632
633 fs_inst *
634 fs_visitor::emit(enum opcode opcode, fs_reg dst)
635 {
636 return emit(fs_inst(opcode, dst));
637 }
638
639 fs_inst *
640 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
641 {
642 return emit(fs_inst(opcode, dst, src0));
643 }
644
645 fs_inst *
646 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
647 {
648 return emit(fs_inst(opcode, dst, src0, src1));
649 }
650
651 fs_inst *
652 fs_visitor::emit(enum opcode opcode, fs_reg dst,
653 fs_reg src0, fs_reg src1, fs_reg src2)
654 {
655 return emit(fs_inst(opcode, dst, src0, src1, src2));
656 }
657
658 void
659 fs_visitor::push_force_uncompressed()
660 {
661 force_uncompressed_stack++;
662 }
663
664 void
665 fs_visitor::pop_force_uncompressed()
666 {
667 force_uncompressed_stack--;
668 assert(force_uncompressed_stack >= 0);
669 }
670
671 void
672 fs_visitor::push_force_sechalf()
673 {
674 force_sechalf_stack++;
675 }
676
677 void
678 fs_visitor::pop_force_sechalf()
679 {
680 force_sechalf_stack--;
681 assert(force_sechalf_stack >= 0);
682 }
683
684 /**
685 * Returns true if the instruction has a flag that means it won't
686 * update an entire destination register.
687 *
688 * For example, dead code elimination and live variable analysis want to know
689 * when a write to a variable screens off any preceding values that were in
690 * it.
691 */
692 bool
693 fs_inst::is_partial_write()
694 {
695 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
696 this->force_uncompressed ||
697 this->force_sechalf);
698 }
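/* e.g. a MOV with force_sechalf set only defines the second half of a
 * SIMD16 destination, and a predicated (non-SEL) write leaves unselected
 * channels untouched, so older values there stay live.
 */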
699
700 /**
701 * Returns how many MRFs an FS opcode will write over.
702 *
703 * Note that this is not the 0 or 1 implied writes in an actual gen
704 * instruction -- the FS opcodes often generate MOVs in addition.
705 */
706 int
707 fs_visitor::implied_mrf_writes(fs_inst *inst)
708 {
709 if (inst->mlen == 0)
710 return 0;
711
712 switch (inst->opcode) {
713 case SHADER_OPCODE_RCP:
714 case SHADER_OPCODE_RSQ:
715 case SHADER_OPCODE_SQRT:
716 case SHADER_OPCODE_EXP2:
717 case SHADER_OPCODE_LOG2:
718 case SHADER_OPCODE_SIN:
719 case SHADER_OPCODE_COS:
720 return 1 * dispatch_width / 8;
721 case SHADER_OPCODE_POW:
722 case SHADER_OPCODE_INT_QUOTIENT:
723 case SHADER_OPCODE_INT_REMAINDER:
724 return 2 * dispatch_width / 8;
725 case SHADER_OPCODE_TEX:
726 case FS_OPCODE_TXB:
727 case SHADER_OPCODE_TXD:
728 case SHADER_OPCODE_TXF:
729 case SHADER_OPCODE_TXF_MS:
730 case SHADER_OPCODE_TG4:
731 case SHADER_OPCODE_TXL:
732 case SHADER_OPCODE_TXS:
733 case SHADER_OPCODE_LOD:
734 return 1;
735 case FS_OPCODE_FB_WRITE:
736 return 2;
737 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
738 case FS_OPCODE_UNSPILL:
739 return 1;
740 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
741 return inst->mlen;
742 case FS_OPCODE_SPILL:
743 return 2;
744 default:
745 assert(!"not reached");
746 return inst->mlen;
747 }
748 }
749
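/** Allocate a new virtual GRF of the given size (in registers) and return
 * its index, growing the virtual_grf_sizes array as needed.
 */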
750 int
751 fs_visitor::virtual_grf_alloc(int size)
752 {
753 if (virtual_grf_array_size <= virtual_grf_count) {
754 if (virtual_grf_array_size == 0)
755 virtual_grf_array_size = 16;
756 else
757 virtual_grf_array_size *= 2;
758 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
759 virtual_grf_array_size);
760 }
761 virtual_grf_sizes[virtual_grf_count] = size;
762 return virtual_grf_count++;
763 }
764
765 /** Register file/number constructor (GRF, MRF, etc.), defaulting to float type. */
766 fs_reg::fs_reg(enum register_file file, int reg)
767 {
768 init();
769 this->file = file;
770 this->reg = reg;
771 this->type = BRW_REGISTER_TYPE_F;
772 }
773
774 /** Register file/number constructor with an explicit register type. */
775 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
776 {
777 init();
778 this->file = file;
779 this->reg = reg;
780 this->type = type;
781 }
782
783 /** Automatic reg constructor. */
784 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
785 {
786 init();
787
788 this->file = GRF;
789 this->reg = v->virtual_grf_alloc(v->type_size(type));
790 this->reg_offset = 0;
791 this->type = brw_type_for_base_type(type);
792 }
793
794 fs_reg *
795 fs_visitor::variable_storage(ir_variable *var)
796 {
797 return (fs_reg *)hash_table_find(this->variable_ht, var);
798 }
799
800 void
801 import_uniforms_callback(const void *key,
802 void *data,
803 void *closure)
804 {
805 struct hash_table *dst_ht = (struct hash_table *)closure;
806 const fs_reg *reg = (const fs_reg *)data;
807
808 if (reg->file != UNIFORM)
809 return;
810
811 hash_table_insert(dst_ht, data, key);
812 }
813
814 /* For 16-wide, we need to follow the uniform setup from the 8-wide dispatch.
815 * This brings in those uniform definitions.
816 */
817 void
818 fs_visitor::import_uniforms(fs_visitor *v)
819 {
820 hash_table_call_foreach(v->variable_ht,
821 import_uniforms_callback,
822 variable_ht);
823 this->params_remap = v->params_remap;
824 this->nr_params_remap = v->nr_params_remap;
825 }
826
827 /* Our support for uniforms is piggy-backed on the struct
828 * gl_fragment_program, because that's where the values actually
829 * get stored, rather than in some global gl_shader_program uniform
830 * store.
831 */
832 void
833 fs_visitor::setup_uniform_values(ir_variable *ir)
834 {
835 int namelen = strlen(ir->name);
836
837 /* The data for our (non-builtin) uniforms is stored in a series of
838 * gl_uniform_driver_storage structs for each subcomponent that
839 * glGetUniformLocation() could name. We know it's been set up in the same
840 * order we'd walk the type, so walk the list of storage and find anything
841 * with our name, or the prefix of a component that starts with our name.
842 */
843 unsigned params_before = c->prog_data.nr_params;
844 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
845 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
846
847 if (strncmp(ir->name, storage->name, namelen) != 0 ||
848 (storage->name[namelen] != 0 &&
849 storage->name[namelen] != '.' &&
850 storage->name[namelen] != '[')) {
851 continue;
852 }
853
854 unsigned slots = storage->type->component_slots();
855 if (storage->array_elements)
856 slots *= storage->array_elements;
857
858 for (unsigned i = 0; i < slots; i++) {
859 c->prog_data.param[c->prog_data.nr_params++] =
860 &storage->storage[i].f;
861 }
862 }
863
864 /* Make sure we actually initialized the right amount of stuff here. */
865 assert(params_before + ir->type->component_slots() ==
866 c->prog_data.nr_params);
867 (void)params_before;
868 }
869
870
871 /* Our support for builtin uniforms is even scarier than non-builtin.
872 * It sits on top of the PROG_STATE_VAR parameters that are
873 * automatically updated from GL context state.
874 */
875 void
876 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
877 {
878 const ir_state_slot *const slots = ir->state_slots;
879 assert(ir->state_slots != NULL);
880
881 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
882 /* This state reference has already been set up by ir_to_mesa, but we'll
883 * get the same index back here.
884 */
885 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
886 (gl_state_index *)slots[i].tokens);
887
888 /* Add each of the unique swizzles of the element as a parameter.
889 * This'll end up matching the expected layout of the
890 * array/matrix/structure we're trying to fill in.
891 */
892 int last_swiz = -1;
893 for (unsigned int j = 0; j < 4; j++) {
894 int swiz = GET_SWZ(slots[i].swizzle, j);
895 if (swiz == last_swiz)
896 break;
897 last_swiz = swiz;
898
899 c->prog_data.param[c->prog_data.nr_params++] =
900 &fp->Base.Parameters->ParameterValues[index][swiz].f;
901 }
902 }
903 }
904
905 fs_reg *
906 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
907 {
908 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
909 fs_reg wpos = *reg;
910 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
911
912 /* gl_FragCoord.x */
913 if (ir->pixel_center_integer) {
914 emit(MOV(wpos, this->pixel_x));
915 } else {
916 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
917 }
918 wpos.reg_offset++;
919
920 /* gl_FragCoord.y */
921 if (!flip && ir->pixel_center_integer) {
922 emit(MOV(wpos, this->pixel_y));
923 } else {
924 fs_reg pixel_y = this->pixel_y;
925 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
926
927 if (flip) {
928 pixel_y.negate = true;
929 offset += c->key.drawable_height - 1.0;
930 }
931
932 emit(ADD(wpos, pixel_y, fs_reg(offset)));
933 }
934 wpos.reg_offset++;
935
936 /* gl_FragCoord.z */
937 if (brw->gen >= 6) {
938 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
939 } else {
940 emit(FS_OPCODE_LINTERP, wpos,
941 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
942 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
943 interp_reg(VARYING_SLOT_POS, 2));
944 }
945 wpos.reg_offset++;
946
947 /* gl_FragCoord.w: Already set up in emit_interpolation */
948 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
949
950 return reg;
951 }
952
953 fs_inst *
954 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
955 glsl_interp_qualifier interpolation_mode,
956 bool is_centroid)
957 {
958 brw_wm_barycentric_interp_mode barycoord_mode;
959 if (brw->gen >= 6) {
960 if (is_centroid) {
961 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
962 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
963 else
964 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
965 } else {
966 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
967 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
968 else
969 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
970 }
971 } else {
972 /* On Ironlake and below, there is only one interpolation mode.
973 * Centroid interpolation doesn't mean anything on this hardware --
974 * there is no multisampling.
975 */
976 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
977 }
978 return emit(FS_OPCODE_LINTERP, attr,
979 this->delta_x[barycoord_mode],
980 this->delta_y[barycoord_mode], interp);
981 }
982
983 fs_reg *
984 fs_visitor::emit_general_interpolation(ir_variable *ir)
985 {
986 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
987 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
988 fs_reg attr = *reg;
989
990 unsigned int array_elements;
991 const glsl_type *type;
992
993 if (ir->type->is_array()) {
994 array_elements = ir->type->length;
995 if (array_elements == 0) {
996 fail("dereferenced array '%s' has length 0\n", ir->name);
997 }
998 type = ir->type->fields.array;
999 } else {
1000 array_elements = 1;
1001 type = ir->type;
1002 }
1003
1004 glsl_interp_qualifier interpolation_mode =
1005 ir->determine_interpolation_mode(c->key.flat_shade);
1006
1007 int location = ir->location;
1008 for (unsigned int i = 0; i < array_elements; i++) {
1009 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1010 if (c->prog_data.urb_setup[location] == -1) {
1011 /* If there's no incoming setup data for this slot, don't
1012 * emit interpolation for it.
1013 */
1014 attr.reg_offset += type->vector_elements;
1015 location++;
1016 continue;
1017 }
1018
1019 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1020 /* Constant interpolation (flat shading) case. The SF has
1021 * handed us defined values in only the constant offset
1022 * field of the setup reg.
1023 */
1024 for (unsigned int k = 0; k < type->vector_elements; k++) {
1025 struct brw_reg interp = interp_reg(location, k);
1026 interp = suboffset(interp, 3);
1027 interp.type = reg->type;
1028 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1029 attr.reg_offset++;
1030 }
1031 } else {
1032 /* Smooth/noperspective interpolation case. */
1033 for (unsigned int k = 0; k < type->vector_elements; k++) {
1034 /* FINISHME: At some point we probably want to push
1035 * this farther by giving similar treatment to the
1036 * other potentially constant components of the
1037 * attribute, as well as making brw_vs_constval.c
1038 * handle varyings other than gl_TexCoord.
1039 */
1040 struct brw_reg interp = interp_reg(location, k);
1041 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1042 ir->centroid);
1043 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1044 /* Get the pixel/sample mask into f0 so that we know
1045 * which pixels are lit. Then, for each channel that is
1046 * unlit, replace the centroid data with non-centroid
1047 * data.
1048 */
1049 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1050 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1051 interpolation_mode, false);
1052 inst->predicate = BRW_PREDICATE_NORMAL;
1053 inst->predicate_inverse = true;
1054 }
1055 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1056 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1057 }
1058 attr.reg_offset++;
1059 }
1060
1061 }
1062 location++;
1063 }
1064 }
1065
1066 return reg;
1067 }
1068
1069 fs_reg *
1070 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1071 {
1072 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1073
1074 /* The frontfacing comes in as a bit in the thread payload. */
1075 if (brw->gen >= 6) {
1076 emit(BRW_OPCODE_ASR, *reg,
1077 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1078 fs_reg(15));
1079 emit(BRW_OPCODE_NOT, *reg, *reg);
1080 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1081 } else {
1082 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1083 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1084 * us front face
1085 */
1086 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1087 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1088 }
1089
1090 return reg;
1091 }
1092
1093 fs_reg
1094 fs_visitor::fix_math_operand(fs_reg src)
1095 {
1096 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1097 * might be able to do better by doing execsize = 1 math and then
1098 * expanding that result out, but we would need to be careful with
1099 * masking.
1100 *
1101 * The hardware ignores source modifiers (negate and abs) on math
1102 * instructions, so we also move to a temp to set those up.
1103 */
1104 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1105 !src.abs && !src.negate)
1106 return src;
1107
1108 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1109 * operands to math
1110 */
1111 if (brw->gen >= 7 && src.file != IMM)
1112 return src;
1113
1114 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1115 expanded.type = src.type;
1116 emit(BRW_OPCODE_MOV, expanded, src);
1117 return expanded;
1118 }
1119
1120 fs_inst *
1121 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1122 {
1123 switch (opcode) {
1124 case SHADER_OPCODE_RCP:
1125 case SHADER_OPCODE_RSQ:
1126 case SHADER_OPCODE_SQRT:
1127 case SHADER_OPCODE_EXP2:
1128 case SHADER_OPCODE_LOG2:
1129 case SHADER_OPCODE_SIN:
1130 case SHADER_OPCODE_COS:
1131 break;
1132 default:
1133 assert(!"not reached: bad math opcode");
1134 return NULL;
1135 }
1136
1137 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1138 * might be able to do better by doing execsize = 1 math and then
1139 * expanding that result out, but we would need to be careful with
1140 * masking.
1141 *
1142 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1143 * instructions, so we also move to a temp to set those up.
1144 */
1145 if (brw->gen >= 6)
1146 src = fix_math_operand(src);
1147
1148 fs_inst *inst = emit(opcode, dst, src);
1149
1150 if (brw->gen < 6) {
1151 inst->base_mrf = 2;
1152 inst->mlen = dispatch_width / 8;
1153 }
1154
1155 return inst;
1156 }
1157
1158 fs_inst *
1159 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1160 {
1161 int base_mrf = 2;
1162 fs_inst *inst;
1163
1164 switch (opcode) {
1165 case SHADER_OPCODE_INT_QUOTIENT:
1166 case SHADER_OPCODE_INT_REMAINDER:
1167 if (brw->gen >= 7 && dispatch_width == 16)
1168 fail("16-wide INTDIV unsupported\n");
1169 break;
1170 case SHADER_OPCODE_POW:
1171 break;
1172 default:
1173 assert(!"not reached: unsupported binary math opcode.");
1174 return NULL;
1175 }
1176
1177 if (brw->gen >= 6) {
1178 src0 = fix_math_operand(src0);
1179 src1 = fix_math_operand(src1);
1180
1181 inst = emit(opcode, dst, src0, src1);
1182 } else {
1183 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1184 * "Message Payload":
1185 *
1186 * "Operand0[7]. For the INT DIV functions, this operand is the
1187 * denominator."
1188 * ...
1189 * "Operand1[7]. For the INT DIV functions, this operand is the
1190 * numerator."
1191 */
1192 bool is_int_div = opcode != SHADER_OPCODE_POW;
1193 fs_reg &op0 = is_int_div ? src1 : src0;
1194 fs_reg &op1 = is_int_div ? src0 : src1;
1195
1196 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1197 inst = emit(opcode, dst, op0, reg_null_f);
1198
1199 inst->base_mrf = base_mrf;
1200 inst->mlen = 2 * dispatch_width / 8;
1201 }
1202 return inst;
1203 }
1204
1205 void
1206 fs_visitor::assign_curb_setup()
1207 {
1208 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1209 if (dispatch_width == 8) {
1210 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1211 } else {
1212 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1213 }
1214
1215 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1216 foreach_list(node, &this->instructions) {
1217 fs_inst *inst = (fs_inst *)node;
1218
1219 for (unsigned int i = 0; i < 3; i++) {
1220 if (inst->src[i].file == UNIFORM) {
1221 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1222 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1223 constant_nr / 8,
1224 constant_nr % 8);
1225
1226 inst->src[i].file = HW_REG;
1227 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1228 }
1229 }
1230 }
1231 }
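/* For example, with nr_payload_regs == 2, UNIFORM constant_nr 10 ends up
 * as the fixed register g3.2 above (2 + 10 / 8, subregister 10 % 8).
 */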
1232
1233 void
1234 fs_visitor::calculate_urb_setup()
1235 {
1236 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1237 c->prog_data.urb_setup[i] = -1;
1238 }
1239
1240 int urb_next = 0;
1241 /* Figure out where each of the incoming setup attributes lands. */
1242 if (brw->gen >= 6) {
1243 if (_mesa_bitcount_64(fp->Base.InputsRead &
1244 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1245 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1246 * first 16 varying inputs, so we can put them wherever we want.
1247 * Just put them in order.
1248 *
1249 * This is useful because it means that (a) inputs not used by the
1250 * fragment shader won't take up valuable register space, and (b) we
1251 * won't have to recompile the fragment shader if it gets paired with
1252 * a different vertex (or geometry) shader.
1253 */
1254 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1255 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1256 BITFIELD64_BIT(i)) {
1257 c->prog_data.urb_setup[i] = urb_next++;
1258 }
1259 }
1260 } else {
1261 /* We have enough input varyings that the SF/SBE pipeline stage can't
1262 * arbitrarily rearrange them to suit our whim; we have to put them
1263 * in an order that matches the output of the previous pipeline stage
1264 * (geometry or vertex shader).
1265 */
1266 struct brw_vue_map prev_stage_vue_map;
1267 brw_compute_vue_map(brw, &prev_stage_vue_map,
1268 c->key.input_slots_valid);
1269 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1270 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1271 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1272 slot++) {
1273 int varying = prev_stage_vue_map.slot_to_varying[slot];
1274 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1275 * unused.
1276 */
1277 if (varying != BRW_VARYING_SLOT_COUNT &&
1278 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1279 BITFIELD64_BIT(varying))) {
1280 c->prog_data.urb_setup[varying] = slot - first_slot;
1281 }
1282 }
1283 urb_next = prev_stage_vue_map.num_slots - first_slot;
1284 }
1285 } else {
1286 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1287 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1288 /* Point size is packed into the header, not as a general attribute */
1289 if (i == VARYING_SLOT_PSIZ)
1290 continue;
1291
1292 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1293 /* The back color slot is skipped when the front color is
1294 * also written to. In addition, some slots can be
1295 * written in the vertex shader and not read in the
1296 * fragment shader. So the register number must always be
1297 * incremented, mapped or not.
1298 */
1299 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1300 c->prog_data.urb_setup[i] = urb_next;
1301 urb_next++;
1302 }
1303 }
1304
1305 /*
1306 * The point coordinate (PNTC) is an FS-only attribute, and the SF thread
1307 * did the interpolation for it, so count it here, too.
1308 *
1309 * See compile_sf_prog() for more info.
1310 */
1311 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1312 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1313 }
1314
1315 c->prog_data.num_varying_inputs = urb_next;
1316 }
1317
1318 void
1319 fs_visitor::assign_urb_setup()
1320 {
1321 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1322
1323 /* Offset all the urb_setup[] index by the actual position of the
1324 * setup regs, now that the location of the constants has been chosen.
1325 */
1326 foreach_list(node, &this->instructions) {
1327 fs_inst *inst = (fs_inst *)node;
1328
1329 if (inst->opcode == FS_OPCODE_LINTERP) {
1330 assert(inst->src[2].file == HW_REG);
1331 inst->src[2].fixed_hw_reg.nr += urb_start;
1332 }
1333
1334 if (inst->opcode == FS_OPCODE_CINTERP) {
1335 assert(inst->src[0].file == HW_REG);
1336 inst->src[0].fixed_hw_reg.nr += urb_start;
1337 }
1338 }
1339
1340 /* Each attribute is 4 setup channels, each of which is half a reg. */
1341 this->first_non_payload_grf =
1342 urb_start + c->prog_data.num_varying_inputs * 2;
1343 }
1344
1345 /**
1346 * Split large virtual GRFs into separate components if we can.
1347 *
1348 * This is mostly duplicated with what brw_fs_vector_splitting does,
1349 * but that's really conservative because it's afraid of doing
1350 * splitting that doesn't result in real progress after the rest of
1351 * the optimization phases, which would cause infinite looping in
1352 * optimization. We can do it once here, safely. This also has the
1353 * opportunity to split interpolated values, or maybe even uniforms,
1354 * which we don't have at the IR level.
1355 *
1356 * We want to split, because virtual GRFs are what we register
1357 * allocate and spill (due to contiguousness requirements for some
1358 * instructions), and they're what we naturally generate in the
1359 * codegen process, but most virtual GRFs don't actually need to be
1360 * contiguous sets of GRFs. If we split, we'll end up with reduced
1361 * live intervals and better dead code elimination and coalescing.
1362 */
1363 void
1364 fs_visitor::split_virtual_grfs()
1365 {
1366 int num_vars = this->virtual_grf_count;
1367 bool split_grf[num_vars];
1368 int new_virtual_grf[num_vars];
1369
1370 /* Try to split anything > 0 sized. */
1371 for (int i = 0; i < num_vars; i++) {
1372 if (this->virtual_grf_sizes[i] != 1)
1373 split_grf[i] = true;
1374 else
1375 split_grf[i] = false;
1376 }
1377
1378 if (brw->has_pln &&
1379 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1380 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1381 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1382 * Gen6, that was the only supported interpolation mode, and since Gen6,
1383 * delta_x and delta_y are in fixed hardware registers.
1384 */
1385 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1386 false;
1387 }
1388
1389 foreach_list(node, &this->instructions) {
1390 fs_inst *inst = (fs_inst *)node;
1391
1392 /* If there's a SEND message that requires contiguous destination
1393 * registers, no splitting is allowed.
1394 */
1395 if (inst->regs_written > 1) {
1396 split_grf[inst->dst.reg] = false;
1397 }
1398
1399 /* If we're sending from a GRF, don't split it, on the assumption that
1400 * the send is reading the whole thing.
1401 */
1402 if (inst->is_send_from_grf()) {
1403 for (int i = 0; i < 3; i++) {
1404 if (inst->src[i].file == GRF) {
1405 split_grf[inst->src[i].reg] = false;
1406 }
1407 }
1408 }
1409 }
1410
1411 /* Allocate new space for split regs. Note that the virtual
1412 * numbers will be contiguous.
1413 */
1414 for (int i = 0; i < num_vars; i++) {
1415 if (split_grf[i]) {
1416 new_virtual_grf[i] = virtual_grf_alloc(1);
1417 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1418 int reg = virtual_grf_alloc(1);
1419 assert(reg == new_virtual_grf[i] + j - 1);
1420 (void) reg;
1421 }
1422 this->virtual_grf_sizes[i] = 1;
1423 }
1424 }
1425
1426 foreach_list(node, &this->instructions) {
1427 fs_inst *inst = (fs_inst *)node;
1428
1429 if (inst->dst.file == GRF &&
1430 split_grf[inst->dst.reg] &&
1431 inst->dst.reg_offset != 0) {
1432 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1433 inst->dst.reg_offset - 1);
1434 inst->dst.reg_offset = 0;
1435 }
1436 for (int i = 0; i < 3; i++) {
1437 if (inst->src[i].file == GRF &&
1438 split_grf[inst->src[i].reg] &&
1439 inst->src[i].reg_offset != 0) {
1440 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1441 inst->src[i].reg_offset - 1);
1442 inst->src[i].reg_offset = 0;
1443 }
1444 }
1445 }
1446 this->live_intervals_valid = false;
1447 }
1448
1449 /**
1450 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1451 *
1452 * During code generation, we create tons of temporary variables, many of
1453 * which get immediately killed and are never used again. Yet, in later
1454 * optimization and analysis passes, such as compute_live_intervals, we need
1455 * to loop over all the virtual GRFs. Compacting them can save a lot of
1456 * overhead.
1457 */
1458 void
1459 fs_visitor::compact_virtual_grfs()
1460 {
1461 /* Mark which virtual GRFs are used, and count how many. */
1462 int remap_table[this->virtual_grf_count];
1463 memset(remap_table, -1, sizeof(remap_table));
1464
1465 foreach_list(node, &this->instructions) {
1466 const fs_inst *inst = (const fs_inst *) node;
1467
1468 if (inst->dst.file == GRF)
1469 remap_table[inst->dst.reg] = 0;
1470
1471 for (int i = 0; i < 3; i++) {
1472 if (inst->src[i].file == GRF)
1473 remap_table[inst->src[i].reg] = 0;
1474 }
1475 }
1476
1477 /* In addition to registers used in instructions, fs_visitor keeps
1478 * direct references to certain special values which must be patched:
1479 */
1480 fs_reg *special[] = {
1481 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1482 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1483 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1484 &delta_x[0], &delta_x[1], &delta_x[2],
1485 &delta_x[3], &delta_x[4], &delta_x[5],
1486 &delta_y[0], &delta_y[1], &delta_y[2],
1487 &delta_y[3], &delta_y[4], &delta_y[5],
1488 };
1489 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1490 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1491
1492 /* Treat all special values as used, to be conservative */
1493 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1494 if (special[i]->file == GRF)
1495 remap_table[special[i]->reg] = 0;
1496 }
1497
1498 /* Compact the GRF arrays. */
1499 int new_index = 0;
1500 for (int i = 0; i < this->virtual_grf_count; i++) {
1501 if (remap_table[i] != -1) {
1502 remap_table[i] = new_index;
1503 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1504 if (live_intervals_valid) {
1505 virtual_grf_start[new_index] = virtual_grf_start[i];
1506 virtual_grf_end[new_index] = virtual_grf_end[i];
1507 }
1508 ++new_index;
1509 }
1510 }
1511
1512 this->virtual_grf_count = new_index;
1513
1514 /* Patch all the instructions to use the newly renumbered registers */
1515 foreach_list(node, &this->instructions) {
1516 fs_inst *inst = (fs_inst *) node;
1517
1518 if (inst->dst.file == GRF)
1519 inst->dst.reg = remap_table[inst->dst.reg];
1520
1521 for (int i = 0; i < 3; i++) {
1522 if (inst->src[i].file == GRF)
1523 inst->src[i].reg = remap_table[inst->src[i].reg];
1524 }
1525 }
1526
1527 /* Patch all the references to special values */
1528 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1529 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1530 special[i]->reg = remap_table[special[i]->reg];
1531 }
1532 }
1533
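/**
 * Drops push-constant params that no instruction actually reads and
 * renumbers the survivors. The remap table is built during the 8-wide
 * compile and reused verbatim by the 16-wide compile.
 */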
1534 bool
1535 fs_visitor::remove_dead_constants()
1536 {
1537 if (dispatch_width == 8) {
1538 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1539 this->nr_params_remap = c->prog_data.nr_params;
1540
1541 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1542 this->params_remap[i] = -1;
1543
1544 /* Find which params are still in use. */
1545 foreach_list(node, &this->instructions) {
1546 fs_inst *inst = (fs_inst *)node;
1547
1548 for (int i = 0; i < 3; i++) {
1549 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1550
1551 if (inst->src[i].file != UNIFORM)
1552 continue;
1553
1554 /* Section 5.11 of the OpenGL 4.3 spec says:
1555 *
1556 * "Out-of-bounds reads return undefined values, which include
1557 * values from other variables of the active program or zero."
1558 */
1559 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1560 constant_nr = 0;
1561 }
1562
1563 /* For now, set this to non-negative. We'll give it the
1564 * actual new number in a moment, in order to keep the
1565 * register numbers nicely ordered.
1566 */
1567 this->params_remap[constant_nr] = 0;
1568 }
1569 }
1570
1571 /* Figure out what the new numbers for the params will be. At some
1572 * point when we're doing uniform array access, we're going to want
1573 * to keep the distinction between .reg and .reg_offset, but for
1574 * now we don't care.
1575 */
1576 unsigned int new_nr_params = 0;
1577 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1578 if (this->params_remap[i] != -1) {
1579 this->params_remap[i] = new_nr_params++;
1580 }
1581 }
1582
1583 /* Update the list of params to be uploaded to match our new numbering. */
1584 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1585 int remapped = this->params_remap[i];
1586
1587 if (remapped == -1)
1588 continue;
1589
1590 c->prog_data.param[remapped] = c->prog_data.param[i];
1591 }
1592
1593 c->prog_data.nr_params = new_nr_params;
1594 } else {
1595 /* This should have been generated in the 8-wide pass already. */
1596 assert(this->params_remap);
1597 }
1598
1599 /* Now do the renumbering of the shader to remove unused params. */
1600 foreach_list(node, &this->instructions) {
1601 fs_inst *inst = (fs_inst *)node;
1602
1603 for (int i = 0; i < 3; i++) {
1604 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1605
1606 if (inst->src[i].file != UNIFORM)
1607 continue;
1608
1609 /* As above, alias out-of-bounds accesses to constant 0. */
1610 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1611 constant_nr = 0;
1612 }
1613 assert(this->params_remap[constant_nr] != -1);
1614 inst->src[i].reg = this->params_remap[constant_nr];
1615 inst->src[i].reg_offset = 0;
1616 }
1617 }
1618
1619 return true;
1620 }
1621
1622 /*
1623 * Implements array access of uniforms by inserting a
1624 * PULL_CONSTANT_LOAD instruction.
1625 *
1626 * Unlike temporary GRF array access (where we don't support it due to
1627 * the difficulty of doing relative addressing on instruction
1628 * destinations), we could potentially do array access of uniforms
1629 * that were loaded in GRF space as push constants. In real-world
1630 * usage we've seen, though, the arrays being used are always larger
1631 * than we could load as push constants, so just always move all
1632 * uniform array access out to a pull constant buffer.
1633 */
1634 void
1635 fs_visitor::move_uniform_array_access_to_pull_constants()
1636 {
1637 int pull_constant_loc[c->prog_data.nr_params];
1638
1639 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1640 pull_constant_loc[i] = -1;
1641 }
1642
1643 /* Walk through and find array access of uniforms. Put a copy of that
1644 * uniform in the pull constant buffer.
1645 *
1646 * Note that we don't move constant-indexed accesses to arrays. No
1647 * testing has been done of the performance impact of this choice.
1648 */
1649 foreach_list_safe(node, &this->instructions) {
1650 fs_inst *inst = (fs_inst *)node;
1651
1652 for (int i = 0 ; i < 3; i++) {
1653 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1654 continue;
1655
1656 int uniform = inst->src[i].reg;
1657
1658 /* If this array isn't already present in the pull constant buffer,
1659 * add it.
1660 */
1661 if (pull_constant_loc[uniform] == -1) {
1662 const float **values = &c->prog_data.param[uniform];
1663
1664 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1665
1666 assert(param_size[uniform]);
1667
1668 for (int j = 0; j < param_size[uniform]; j++) {
1669 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1670 values[j];
1671 }
1672 }
1673
1674 /* Set up the annotation tracking for newly generated instructions. */
1675 base_ir = inst->ir;
1676 current_annotation = inst->annotation;
1677
1678 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1679 fs_reg temp = fs_reg(this, glsl_type::float_type);
1680 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1681 surf_index,
1682 *inst->src[i].reladdr,
1683 pull_constant_loc[uniform] +
1684 inst->src[i].reg_offset);
1685 inst->insert_before(&list);
1686
1687 inst->src[i].file = temp.file;
1688 inst->src[i].reg = temp.reg;
1689 inst->src[i].reg_offset = temp.reg_offset;
1690 inst->src[i].reladdr = NULL;
1691 }
1692 }
1693 }
1694
1695 /**
1696 * Choose accesses from the UNIFORM file to demote to using the pull
1697 * constant buffer.
1698 *
1699 * We allow a fragment shader to have more than the specified minimum
1700 * maximum number of fragment shader uniform components (64). If
1701 * there are too many of these, they'd fill up all of register space.
1702 * So, this will push some of them out to the pull constant buffer and
1703 * update the program to load them.
1704 */
1705 void
1706 fs_visitor::setup_pull_constants()
1707 {
1708 /* Only allow 16 registers (128 uniform components) as push constants. */
1709 unsigned int max_uniform_components = 16 * 8;
1710 if (c->prog_data.nr_params <= max_uniform_components)
1711 return;
1712
1713 if (dispatch_width == 16) {
1714 fail("Pull constants not supported in 16-wide\n");
1715 return;
1716 }
1717
1718 /* Just demote the end of the list. We could probably do better
1719 * here, demoting things that are rarely used in the program first.
1720 */
1721 unsigned int pull_uniform_base = max_uniform_components;
1722
1723 int pull_constant_loc[c->prog_data.nr_params];
1724 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1725 if (i < pull_uniform_base) {
1726 pull_constant_loc[i] = -1;
1727 } else {
1728 pull_constant_loc[i] = -1;
1729 /* If our constant is already being uploaded for reladdr purposes,
1730 * reuse it.
1731 */
1732 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1733 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1734 pull_constant_loc[i] = j;
1735 break;
1736 }
1737 }
1738 if (pull_constant_loc[i] == -1) {
1739 int pull_index = c->prog_data.nr_pull_params++;
1740 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1741 pull_constant_loc[i] = pull_index;
1742 }
1743 }
1744 }
1745 c->prog_data.nr_params = pull_uniform_base;
1746
1747 foreach_list(node, &this->instructions) {
1748 fs_inst *inst = (fs_inst *)node;
1749
1750 for (int i = 0; i < 3; i++) {
1751 if (inst->src[i].file != UNIFORM)
1752 continue;
1753
1754 int pull_index = pull_constant_loc[inst->src[i].reg +
1755 inst->src[i].reg_offset];
1756 if (pull_index == -1)
1757 continue;
1758
1759 assert(!inst->src[i].reladdr);
1760
1761 fs_reg dst = fs_reg(this, glsl_type::float_type);
1762 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1763 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1764 fs_inst *pull =
1765 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1766 dst, index, offset);
1767 pull->ir = inst->ir;
1768 pull->annotation = inst->annotation;
1769
1770 inst->insert_before(pull);
1771
1772 inst->src[i].file = GRF;
1773 inst->src[i].reg = dst.reg;
1774 inst->src[i].reg_offset = 0;
1775 inst->src[i].smear = pull_index & 3;
1776 }
1777 }
1778 }
1779
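/**
 * Performs simple algebraic simplifications on instructions with an
 * immediate second operand: a * 1.0 and a + 0.0 become plain MOVs of a,
 * and a * 0.0 becomes a MOV of 0.0.
 */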
1780 bool
1781 fs_visitor::opt_algebraic()
1782 {
1783 bool progress = false;
1784
1785 foreach_list(node, &this->instructions) {
1786 fs_inst *inst = (fs_inst *)node;
1787
1788 switch (inst->opcode) {
1789 case BRW_OPCODE_MUL:
1790 if (inst->src[1].file != IMM)
1791 continue;
1792
1793 /* a * 1.0 = a */
1794 if (inst->src[1].is_one()) {
1795 inst->opcode = BRW_OPCODE_MOV;
1796 inst->src[1] = reg_undef;
1797 progress = true;
1798 break;
1799 }
1800
1801 /* a * 0.0 = 0.0 */
1802 if (inst->src[1].is_zero()) {
1803 inst->opcode = BRW_OPCODE_MOV;
1804 inst->src[0] = inst->src[1];
1805 inst->src[1] = reg_undef;
1806 progress = true;
1807 break;
1808 }
1809
1810 break;
1811 case BRW_OPCODE_ADD:
1812 if (inst->src[1].file != IMM)
1813 continue;
1814
1815 /* a + 0.0 = a */
1816 if (inst->src[1].is_zero()) {
1817 inst->opcode = BRW_OPCODE_MOV;
1818 inst->src[1] = reg_undef;
1819 progress = true;
1820 break;
1821 }
1822 break;
1823 default:
1824 break;
1825 }
1826 }
1827
1828 return progress;
1829 }
1830
1831 /**
1832 * Removes any instructions writing a VGRF where that VGRF is not used by any
1833 * later instruction.
1834 */
1835 bool
1836 fs_visitor::dead_code_eliminate()
1837 {
1838 bool progress = false;
1839 int pc = 0;
1840
1841 calculate_live_intervals();
1842
1843 foreach_list_safe(node, &this->instructions) {
1844 fs_inst *inst = (fs_inst *)node;
1845
1846 if (inst->dst.file == GRF) {
1847 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1848 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1849 /* Don't dead code eliminate instructions that write to the
1850 * accumulator as a side-effect. Instead just set the destination
1851 * to the null register to free it.
1852 */
1853 switch (inst->opcode) {
1854 case BRW_OPCODE_ADDC:
1855 case BRW_OPCODE_SUBB:
1856 case BRW_OPCODE_MACH:
1857 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
1858 break;
1859 default:
1860 inst->remove();
1861 break;
1862 }
1863 progress = true;
1864 }
1865 }
1866
1867 pc++;
1868 }
1869
1870 if (progress)
1871 live_intervals_valid = false;
1872
1873 return progress;
1874 }
1875
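/* Key for the per-basic-block table used by dead_code_eliminate_local():
 * one entry per (VGRF, reg_offset) chunk whose most recent write has not
 * been read yet.
 */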
1876 struct dead_code_hash_key
1877 {
1878 int vgrf;
1879 int reg_offset;
1880 };
1881
1882 static bool
1883 dead_code_hash_compare(const void *a, const void *b)
1884 {
1885 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1886 }
1887
1888 static void
1889 clear_dead_code_hash(struct hash_table *ht)
1890 {
1891 struct hash_entry *entry;
1892
1893 hash_table_foreach(ht, entry) {
1894 _mesa_hash_table_remove(ht, entry);
1895 }
1896 }
1897
1898 static void
1899 insert_dead_code_hash(struct hash_table *ht,
1900 int vgrf, int reg_offset, fs_inst *inst)
1901 {
1902 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1903 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1904
1905 key->vgrf = vgrf;
1906 key->reg_offset = reg_offset;
1907
1908 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1909 }
1910
1911 static struct hash_entry *
1912 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1913 {
1914 struct dead_code_hash_key key;
1915
1916 key.vgrf = vgrf;
1917 key.reg_offset = reg_offset;
1918
1919 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1920 }
1921
1922 static void
1923 remove_dead_code_hash(struct hash_table *ht,
1924 int vgrf, int reg_offset)
1925 {
1926 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1927 if (!entry)
1928 return;
1929
1930 _mesa_hash_table_remove(ht, entry);
1931 }
1932
1933 /**
1934 * Walks basic blocks, removing any regs that are written but not read before
1935 * being redefined.
1936 *
1937 * The dead_code_eliminate() function implements a global dead code
1938 * elimination, but it only handles removing the last write to a register
1939 * if it's never read. This one can handle intermediate writes, but only
1940 * within a basic block.
1941 */
1942 bool
1943 fs_visitor::dead_code_eliminate_local()
1944 {
1945 struct hash_table *ht;
1946 bool progress = false;
1947
1948 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1949
1950 foreach_list_safe(node, &this->instructions) {
1951 fs_inst *inst = (fs_inst *)node;
1952
1953 /* At a basic block boundary, empty the HT since we don't understand
1954 * dataflow across it.
1955 */
1956 if (inst->is_control_flow()) {
1957 clear_dead_code_hash(ht);
1958 continue;
1959 }
1960
1961 /* Clear the HT of any instructions that got read. */
1962 for (int i = 0; i < 3; i++) {
1963 fs_reg src = inst->src[i];
1964 if (src.file != GRF)
1965 continue;
1966
1967 int read = 1;
1968 if (inst->is_send_from_grf())
1969 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1970
1971 for (int reg_offset = src.reg_offset;
1972 reg_offset < src.reg_offset + read;
1973 reg_offset++) {
1974 remove_dead_code_hash(ht, src.reg, reg_offset);
1975 }
1976 }
1977
1978 /* Add any update of a GRF to the HT, removing a previous write if it
1979 * wasn't read.
1980 */
1981 if (inst->dst.file == GRF) {
1982 if (inst->regs_written > 1) {
1983 /* We don't know how to trim channels from an instruction's
1984 * writes, so we can't incrementally remove unread channels from
1985 * it. Just remove whatever it overwrites from the table.
1986 */
1987 for (int i = 0; i < inst->regs_written; i++) {
1988 remove_dead_code_hash(ht,
1989 inst->dst.reg,
1990 inst->dst.reg_offset + i);
1991 }
1992 } else {
1993 struct hash_entry *entry =
1994 get_dead_code_hash_entry(ht, inst->dst.reg,
1995 inst->dst.reg_offset);
1996
1997 if (inst->is_partial_write()) {
1998 /* For a partial write, we can't remove any previous dead code
1999 * candidate, since we're just modifying its result, but we can
2000 * be dead code eliminated ourselves.
2001 */
2002 if (entry) {
2003 entry->data = inst;
2004 } else {
2005 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2006 inst);
2007 }
2008 } else {
2009 if (entry) {
2010 /* We're completely updating a channel, and there was a
2011 * previous write to the channel that wasn't read. Kill it!
2012 */
2013 fs_inst *inst = (fs_inst *)entry->data;
2014 inst->remove();
2015 progress = true;
2016 _mesa_hash_table_remove(ht, entry);
2017 }
2018
2019 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2020 inst);
2021 }
2022 }
2023 }
2024 }
2025
2026 _mesa_hash_table_destroy(ht, NULL);
2027
2028 if (progress)
2029 live_intervals_valid = false;
2030
2031 return progress;
2032 }
2033
2034 /**
2035 * Implements a second type of register coalescing: This one checks if
2036 * the two regs involved in a raw move don't interfere, in which case
2037 * they can both be stored in the same place and the MOV removed.
2038 */
2039 bool
2040 fs_visitor::register_coalesce_2()
2041 {
2042 bool progress = false;
2043
2044 calculate_live_intervals();
2045
2046 foreach_list_safe(node, &this->instructions) {
2047 fs_inst *inst = (fs_inst *)node;
2048
2049 if (inst->opcode != BRW_OPCODE_MOV ||
2050 inst->is_partial_write() ||
2051 inst->saturate ||
2052 inst->src[0].file != GRF ||
2053 inst->src[0].negate ||
2054 inst->src[0].abs ||
2055 inst->src[0].smear != -1 ||
2056 inst->dst.file != GRF ||
2057 inst->dst.type != inst->src[0].type ||
2058 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2059 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2060 continue;
2061 }
2062
2063 int reg_from = inst->src[0].reg;
2064 assert(inst->src[0].reg_offset == 0);
2065 int reg_to = inst->dst.reg;
2066 int reg_to_offset = inst->dst.reg_offset;
2067
2068 foreach_list(node, &this->instructions) {
2069 fs_inst *scan_inst = (fs_inst *)node;
2070
2071 if (scan_inst->dst.file == GRF &&
2072 scan_inst->dst.reg == reg_from) {
2073 scan_inst->dst.reg = reg_to;
2074 scan_inst->dst.reg_offset = reg_to_offset;
2075 }
2076 for (int i = 0; i < 3; i++) {
2077 if (scan_inst->src[i].file == GRF &&
2078 scan_inst->src[i].reg == reg_from) {
2079 scan_inst->src[i].reg = reg_to;
2080 scan_inst->src[i].reg_offset = reg_to_offset;
2081 }
2082 }
2083 }
2084
2085 inst->remove();
2086
2087 /* We don't need to recalculate live intervals inside the loop despite
2088 * flagging live_intervals_valid because we only use live intervals for
2089 * the interferes test, and we must have had a situation where the
2090 * intervals were:
2091 *
2092 * from    to
2093 *  ^
2094 *  |
2095 *  v
2096 *          ^
2097 *          |
2098 *          v
2099 *
2100 * Some register R that might get coalesced with one of these two could
2101 * only be referencing "to", otherwise "from"'s range would have been
2102 * longer. R's range could also only start at the end of "to" or later,
2103 * otherwise it will conflict with "to" when we try to coalesce "to"
2104 * into R anyway.
2105 */
2106 live_intervals_valid = false;
2107
2108 progress = true;
2109 continue;
2110 }
2111
2112 return progress;
2113 }
2114
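/**
 * The basic register coalescing pass: for a raw MOV in straight-line code
 * (outside any loop or IF block) whose source and destination are not
 * written again before the end of the program, rewrites later readers of
 * the destination to read the source and removes the MOV.
 */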
2115 bool
2116 fs_visitor::register_coalesce()
2117 {
2118 bool progress = false;
2119 int if_depth = 0;
2120 int loop_depth = 0;
2121
2122 foreach_list_safe(node, &this->instructions) {
2123 fs_inst *inst = (fs_inst *)node;
2124
2125 /* Make sure that we dominate the instructions we're going to
2126 * scan for interfering with our coalescing, or we won't have
2127 * scanned enough to see if anything interferes with our
2128 * coalescing. We don't dominate the following instructions if
2129 * we're in a loop or an if block.
2130 */
2131 switch (inst->opcode) {
2132 case BRW_OPCODE_DO:
2133 loop_depth++;
2134 break;
2135 case BRW_OPCODE_WHILE:
2136 loop_depth--;
2137 break;
2138 case BRW_OPCODE_IF:
2139 if_depth++;
2140 break;
2141 case BRW_OPCODE_ENDIF:
2142 if_depth--;
2143 break;
2144 default:
2145 break;
2146 }
2147 if (loop_depth || if_depth)
2148 continue;
2149
2150 if (inst->opcode != BRW_OPCODE_MOV ||
2151 inst->is_partial_write() ||
2152 inst->saturate ||
2153 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2154 inst->src[0].file != UNIFORM) ||
2155 inst->dst.type != inst->src[0].type)
2156 continue;
2157
2158 bool has_source_modifiers = (inst->src[0].abs ||
2159 inst->src[0].negate ||
2160 inst->src[0].smear != -1 ||
2161 inst->src[0].file == UNIFORM);
2162
2163 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2164 * them: check for no writes to either one until the exit of the
2165 * program.
2166 */
2167 bool interfered = false;
2168
2169 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2170 !scan_inst->is_tail_sentinel();
2171 scan_inst = (fs_inst *)scan_inst->next) {
2172 if (scan_inst->dst.file == GRF) {
2173 if (scan_inst->overwrites_reg(inst->dst) ||
2174 scan_inst->overwrites_reg(inst->src[0])) {
2175 interfered = true;
2176 break;
2177 }
2178 }
2179
2180 if (has_source_modifiers) {
2181 for (int i = 0; i < 3; i++) {
2182 if (scan_inst->src[i].file == GRF &&
2183 scan_inst->src[i].reg == inst->dst.reg &&
2184 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2185 inst->dst.type != scan_inst->src[i].type)
2186 {
2187 interfered = true;
2188 break;
2189 }
2190 }
2191 }
2192
2193
2194 /* The gen6 MATH instruction can't handle source modifiers or
2195 * unusual register regions, so avoid coalescing those for
2196 * now. We should do something more specific.
2197 */
2198 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2199 interfered = true;
2200 break;
2201 }
2202
2203 /* The accumulator result appears to get used for the
2204 * conditional modifier generation. When negating a UD
2205 * value, there is a 33rd bit generated for the sign in the
2206 * accumulator value, so now you can't check, for example,
2207 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2208 */
2209 if (scan_inst->conditional_mod &&
2210 inst->src[0].negate &&
2211 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2212 interfered = true;
2213 break;
2214 }
2215 }
2216 if (interfered) {
2217 continue;
2218 }
2219
2220 /* Rewrite the later usage to point at the source of the move to
2221 * be removed.
2222 */
2223 for (fs_inst *scan_inst = inst;
2224 !scan_inst->is_tail_sentinel();
2225 scan_inst = (fs_inst *)scan_inst->next) {
2226 for (int i = 0; i < 3; i++) {
2227 if (scan_inst->src[i].file == GRF &&
2228 scan_inst->src[i].reg == inst->dst.reg &&
2229 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2230 fs_reg new_src = inst->src[0];
2231 if (scan_inst->src[i].abs) {
2232 new_src.negate = 0;
2233 new_src.abs = 1;
2234 }
2235 new_src.negate ^= scan_inst->src[i].negate;
2236 scan_inst->src[i] = new_src;
2237 }
2238 }
2239 }
2240
2241 inst->remove();
2242 progress = true;
2243 }
2244
2245 if (progress)
2246 live_intervals_valid = false;
2247
2248 return progress;
2249 }
2250
2251
2252 bool
2253 fs_visitor::compute_to_mrf()
2254 {
2255 bool progress = false;
2256 int next_ip = 0;
2257
2258 calculate_live_intervals();
2259
2260 foreach_list_safe(node, &this->instructions) {
2261 fs_inst *inst = (fs_inst *)node;
2262
2263 int ip = next_ip;
2264 next_ip++;
2265
2266 if (inst->opcode != BRW_OPCODE_MOV ||
2267 inst->is_partial_write() ||
2268 inst->dst.file != MRF || inst->src[0].file != GRF ||
2269 inst->dst.type != inst->src[0].type ||
2270 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2271 continue;
2272
2273 /* Work out which hardware MRF registers are written by this
2274 * instruction.
2275 */
2276 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2277 int mrf_high;
2278 if (inst->dst.reg & BRW_MRF_COMPR4) {
2279 mrf_high = mrf_low + 4;
2280 } else if (dispatch_width == 16 &&
2281 (!inst->force_uncompressed && !inst->force_sechalf)) {
2282 mrf_high = mrf_low + 1;
2283 } else {
2284 mrf_high = mrf_low;
2285 }
2286
2287 /* Can't compute-to-MRF this GRF if someone else was going to
2288 * read it later.
2289 */
2290 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2291 continue;
2292
2293 /* Found a move of a GRF to a MRF. Let's see if we can go
2294 * rewrite the thing that made this GRF to write into the MRF.
2295 */
2296 fs_inst *scan_inst;
2297 for (scan_inst = (fs_inst *)inst->prev;
2298 scan_inst->prev != NULL;
2299 scan_inst = (fs_inst *)scan_inst->prev) {
2300 if (scan_inst->dst.file == GRF &&
2301 scan_inst->dst.reg == inst->src[0].reg) {
2302 /* Found the last thing to write the reg that we want to turn
2303 * into a compute-to-MRF.
2304 */
2305
2306 /* If this one instruction didn't populate all the
2307 * channels, bail. We might be able to rewrite everything
2308 * that writes that reg, but it would require smarter
2309 * tracking to delay the rewriting until complete success.
2310 */
2311 if (scan_inst->is_partial_write())
2312 break;
2313
2314 /* Things returning more than one register would need us to
2315 * understand coalescing out more than one MOV at a time.
2316 */
2317 if (scan_inst->regs_written > 1)
2318 break;
2319
2320 /* SEND instructions can't have MRF as a destination. */
2321 if (scan_inst->mlen)
2322 break;
2323
2324 if (brw->gen == 6) {
2325 /* gen6 math instructions must have the destination be
2326 * GRF, so no compute-to-MRF for them.
2327 */
2328 if (scan_inst->is_math()) {
2329 break;
2330 }
2331 }
2332
2333 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2334 /* Found the creator of our MRF's source value. */
2335 scan_inst->dst.file = MRF;
2336 scan_inst->dst.reg = inst->dst.reg;
2337 scan_inst->saturate |= inst->saturate;
2338 inst->remove();
2339 progress = true;
2340 }
2341 break;
2342 }
2343
2344 /* We don't handle control flow here. Most computation of
2345 * values that end up in MRFs happens shortly before the MRF
2346 * write anyway.
2347 */
2348 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2349 break;
2350
2351 /* You can't read from an MRF, so if someone else reads our
2352 * MRF's source GRF that we wanted to rewrite, that stops us.
2353 */
2354 bool interfered = false;
2355 for (int i = 0; i < 3; i++) {
2356 if (scan_inst->src[i].file == GRF &&
2357 scan_inst->src[i].reg == inst->src[0].reg &&
2358 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2359 interfered = true;
2360 }
2361 }
2362 if (interfered)
2363 break;
2364
2365 if (scan_inst->dst.file == MRF) {
2366 /* If somebody else writes our MRF here, we can't
2367 * compute-to-MRF before that.
2368 */
2369 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2370 int scan_mrf_high;
2371
2372 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2373 scan_mrf_high = scan_mrf_low + 4;
2374 } else if (dispatch_width == 16 &&
2375 (!scan_inst->force_uncompressed &&
2376 !scan_inst->force_sechalf)) {
2377 scan_mrf_high = scan_mrf_low + 1;
2378 } else {
2379 scan_mrf_high = scan_mrf_low;
2380 }
2381
2382 if (mrf_low == scan_mrf_low ||
2383 mrf_low == scan_mrf_high ||
2384 mrf_high == scan_mrf_low ||
2385 mrf_high == scan_mrf_high) {
2386 break;
2387 }
2388 }
2389
2390 if (scan_inst->mlen > 0) {
2391 /* Found a SEND instruction, which means that there are
2392 * live values in MRFs from base_mrf to base_mrf +
2393 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2394 * above it.
2395 */
2396 if (mrf_low >= scan_inst->base_mrf &&
2397 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2398 break;
2399 }
2400 if (mrf_high >= scan_inst->base_mrf &&
2401 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2402 break;
2403 }
2404 }
2405 }
2406 }
2407
2408 if (progress)
2409 live_intervals_valid = false;
2410
2411 return progress;
2412 }
2413
2414 /**
2415 * Walks through basic blocks, looking for repeated MRF writes and
2416 * removing the later ones.
2417 */
2418 bool
2419 fs_visitor::remove_duplicate_mrf_writes()
2420 {
2421 fs_inst *last_mrf_move[16];
2422 bool progress = false;
2423
2424 /* Need to update the MRF tracking for compressed instructions. */
2425 if (dispatch_width == 16)
2426 return false;
2427
2428 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2429
2430 foreach_list_safe(node, &this->instructions) {
2431 fs_inst *inst = (fs_inst *)node;
2432
2433 if (inst->is_control_flow()) {
2434 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2435 }
2436
2437 if (inst->opcode == BRW_OPCODE_MOV &&
2438 inst->dst.file == MRF) {
2439 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2440 if (prev_inst && inst->equals(prev_inst)) {
2441 inst->remove();
2442 progress = true;
2443 continue;
2444 }
2445 }
2446
2447 /* Clear out the last-write records for MRFs that were overwritten. */
2448 if (inst->dst.file == MRF) {
2449 last_mrf_move[inst->dst.reg] = NULL;
2450 }
2451
2452 if (inst->mlen > 0) {
2453 /* Found a SEND instruction, which will include two or fewer
2454 * implied MRF writes. We could do better here.
2455 */
2456 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2457 last_mrf_move[inst->base_mrf + i] = NULL;
2458 }
2459 }
2460
2461 /* Clear out any MRF move records whose sources got overwritten. */
2462 if (inst->dst.file == GRF) {
2463 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2464 if (last_mrf_move[i] &&
2465 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2466 last_mrf_move[i] = NULL;
2467 }
2468 }
2469 }
2470
2471 if (inst->opcode == BRW_OPCODE_MOV &&
2472 inst->dst.file == MRF &&
2473 inst->src[0].file == GRF &&
2474 !inst->is_partial_write()) {
2475 last_mrf_move[inst->dst.reg] = inst;
2476 }
2477 }
2478
2479 if (progress)
2480 live_intervals_valid = false;
2481
2482 return progress;
2483 }
2484
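/* Helper for the gen4 SEND dependency workarounds below: clears the
 * dependency flag for any register in [first_grf, first_grf + grf_len) that
 * this instruction reads, since a read resolves the outstanding dependency.
 */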
2485 static void
2486 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2487 int first_grf, int grf_len)
2488 {
2489 bool inst_16wide = (dispatch_width > 8 &&
2490 !inst->force_uncompressed &&
2491 !inst->force_sechalf);
2492
2493 /* Clear the flag for registers that actually got read (as expected). */
2494 for (int i = 0; i < 3; i++) {
2495 int grf;
2496 if (inst->src[i].file == GRF) {
2497 grf = inst->src[i].reg;
2498 } else if (inst->src[i].file == HW_REG &&
2499 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2500 grf = inst->src[i].fixed_hw_reg.nr;
2501 } else {
2502 continue;
2503 }
2504
2505 if (grf >= first_grf &&
2506 grf < first_grf + grf_len) {
2507 deps[grf - first_grf] = false;
2508 if (inst_16wide)
2509 deps[grf - first_grf + 1] = false;
2510 }
2511 }
2512 }
2513
2514 /**
2515 * Implements this workaround for the original 965:
2516 *
2517 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2518 * check for post destination dependencies on this instruction, software
2519 * must ensure that there is no destination hazard for the case of ‘write
2520 * followed by a posted write’ shown in the following example.
2521 *
2522 * 1. mov r3 0
2523 * 2. send r3.xy <rest of send instruction>
2524 * 3. mov r2 r3
2525 *
2526 * Due to no post-destination dependency check on the ‘send’, the above
2527 * code sequence could have two instructions (1 and 2) in flight at the
2528 * same time that both consider ‘r3’ as the target of their final writes."
2529 */
2530 void
2531 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2532 {
2533 int reg_size = dispatch_width / 8;
2534 int write_len = inst->regs_written * reg_size;
2535 int first_write_grf = inst->dst.reg;
2536 bool needs_dep[BRW_MAX_MRF];
2537 assert(write_len < (int)sizeof(needs_dep) - 1);
2538
2539 memset(needs_dep, false, sizeof(needs_dep));
2540 memset(needs_dep, true, write_len);
2541
2542 clear_deps_for_inst_src(inst, dispatch_width,
2543 needs_dep, first_write_grf, write_len);
2544
2545 /* Walk backwards looking for writes to registers we're writing which
2546 * aren't read since being written. If we hit the start of the program,
2547 * we assume that there are no outstanding dependencies on entry to the
2548 * program.
2549 */
2550 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2551 scan_inst != NULL;
2552 scan_inst = (fs_inst *)scan_inst->prev) {
2553
2554 /* If we hit control flow, assume that there *are* outstanding
2555 * dependencies, and force their cleanup before our instruction.
2556 */
2557 if (scan_inst->is_control_flow()) {
2558 for (int i = 0; i < write_len; i++) {
2559 if (needs_dep[i]) {
2560 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2561 }
2562 }
2563 return;
2564 }
2565
2566 bool scan_inst_16wide = (dispatch_width > 8 &&
2567 !scan_inst->force_uncompressed &&
2568 !scan_inst->force_sechalf);
2569
2570 /* We insert our reads as late as possible on the assumption that any
2571 * instruction but a MOV that might have left us an outstanding
2572 * dependency has more latency than a MOV.
2573 */
2574 if (scan_inst->dst.file == GRF) {
2575 for (int i = 0; i < scan_inst->regs_written; i++) {
2576 int reg = scan_inst->dst.reg + i * reg_size;
2577
2578 if (reg >= first_write_grf &&
2579 reg < first_write_grf + write_len &&
2580 needs_dep[reg - first_write_grf]) {
2581 inst->insert_before(DEP_RESOLVE_MOV(reg));
2582 needs_dep[reg - first_write_grf] = false;
2583 if (scan_inst_16wide)
2584 needs_dep[reg - first_write_grf + 1] = false;
2585 }
2586 }
2587 }
2588
2589 /* Clear the flag for registers that actually got read (as expected). */
2590 clear_deps_for_inst_src(scan_inst, dispatch_width,
2591 needs_dep, first_write_grf, write_len);
2592
2593 /* Continue the loop only if we haven't resolved all the dependencies */
2594 int i;
2595 for (i = 0; i < write_len; i++) {
2596 if (needs_dep[i])
2597 break;
2598 }
2599 if (i == write_len)
2600 return;
2601 }
2602 }
2603
2604 /**
2605 * Implements this workaround for the original 965:
2606 *
2607 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2608 * used as a destination register until after it has been sourced by an
2609 * instruction with a different destination register."
2610 */
2611 void
2612 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2613 {
2614 int write_len = inst->regs_written * dispatch_width / 8;
2615 int first_write_grf = inst->dst.reg;
2616 bool needs_dep[BRW_MAX_MRF];
2617 assert(write_len < (int)sizeof(needs_dep) - 1);
2618
2619 memset(needs_dep, false, sizeof(needs_dep));
2620 memset(needs_dep, true, write_len);
2621 /* Walk forwards looking for writes to registers we're writing which aren't
2622 * read before being written.
2623 */
2624 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2625 !scan_inst->is_tail_sentinel();
2626 scan_inst = (fs_inst *)scan_inst->next) {
2627 /* If we hit control flow, force resolve all remaining dependencies. */
2628 if (scan_inst->is_control_flow()) {
2629 for (int i = 0; i < write_len; i++) {
2630 if (needs_dep[i])
2631 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2632 }
2633 return;
2634 }
2635
2636 /* Clear the flag for registers that actually got read (as expected). */
2637 clear_deps_for_inst_src(scan_inst, dispatch_width,
2638 needs_dep, first_write_grf, write_len);
2639
2640 /* We insert our reads as late as possible since they're reading the
2641 * result of a SEND, which has massive latency.
2642 */
2643 if (scan_inst->dst.file == GRF &&
2644 scan_inst->dst.reg >= first_write_grf &&
2645 scan_inst->dst.reg < first_write_grf + write_len &&
2646 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2647 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2648 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2649 }
2650
2651 /* Continue the loop only if we haven't resolved all the dependencies */
2652 int i;
2653 for (i = 0; i < write_len; i++) {
2654 if (needs_dep[i])
2655 break;
2656 }
2657 if (i == write_len)
2658 return;
2659 }
2660
2661 /* If we hit the end of the program, resolve all remaining dependencies out
2662 * of paranoia.
2663 */
2664 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2665 assert(last_inst->eot);
2666 for (int i = 0; i < write_len; i++) {
2667 if (needs_dep[i])
2668 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2669 }
2670 }
2671
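/**
 * Applies both of the gen4 SEND dependency workarounds above to every SEND
 * that writes a GRF destination (original gen4 only; G4X is unaffected).
 */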
2672 void
2673 fs_visitor::insert_gen4_send_dependency_workarounds()
2674 {
2675 if (brw->gen != 4 || brw->is_g4x)
2676 return;
2677
2678 /* Note that we're done with register allocation, so GRF fs_regs always
2679 * have a .reg_offset of 0.
2680 */
2681
2682 foreach_list_safe(node, &this->instructions) {
2683 fs_inst *inst = (fs_inst *)node;
2684
2685 if (inst->mlen != 0 && inst->dst.file == GRF) {
2686 insert_gen4_pre_send_dependency_workarounds(inst);
2687 insert_gen4_post_send_dependency_workarounds(inst);
2688 }
2689 }
2690 }
2691
2692 /**
2693 * Turns the generic expression-style uniform pull constant load instruction
2694 * into a hardware-specific series of instructions for loading a pull
2695 * constant.
2696 *
2697 * The expression style allows the CSE pass before this to optimize out
2698 * repeated loads from the same offset, and gives the pre-register-allocation
2699 * scheduling full flexibility, while the conversion to native instructions
2700 * allows the post-register-allocation scheduler the best information
2701 * possible.
2702 *
2703 * Note that execution masking for setting up pull constant loads is special:
2704 * the channels that need to be written are unrelated to the current execution
2705 * mask, since a later instruction will use one of the result channels as a
2706 * source operand for all 8 or 16 of its channels.
2707 */
2708 void
2709 fs_visitor::lower_uniform_pull_constant_loads()
2710 {
2711 foreach_list(node, &this->instructions) {
2712 fs_inst *inst = (fs_inst *)node;
2713
2714 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2715 continue;
2716
2717 if (brw->gen >= 7) {
2718 /* The offset arg before was a vec4-aligned byte offset. We need to
2719 * turn it into a dword offset.
2720 */
2721 fs_reg const_offset_reg = inst->src[1];
2722 assert(const_offset_reg.file == IMM &&
2723 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2724 const_offset_reg.imm.u /= 4;
2725 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2726
2727 /* This is actually going to be a MOV, but since only the first dword
2728 * is accessed, we have a special opcode to do just that one. Note
2729 * that this needs to be an operation that will be considered a def
2730 * by live variable analysis, or register allocation will explode.
2731 */
2732 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2733 payload, const_offset_reg);
2734 setup->force_writemask_all = true;
2735
2736 setup->ir = inst->ir;
2737 setup->annotation = inst->annotation;
2738 inst->insert_before(setup);
2739
2740 /* Similarly, this will only populate the first 4 channels of the
2741 * result register (since we only use smear values from 0-3), but we
2742 * don't tell the optimizer.
2743 */
2744 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2745 inst->src[1] = payload;
2746
2747 this->live_intervals_valid = false;
2748 } else {
2749 /* Before register allocation, we didn't tell the scheduler about the
2750 * MRF we use. We know it's safe to use this MRF because nothing
2751 * else does except for register spill/unspill, which generates and
2752 * uses its MRF within a single IR instruction.
2753 */
2754 inst->base_mrf = 14;
2755 inst->mlen = 1;
2756 }
2757 }
2758 }
2759
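/**
 * Prints one FS IR instruction in a readable form for debugging.
 */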
2760 void
2761 fs_visitor::dump_instruction(backend_instruction *be_inst)
2762 {
2763 fs_inst *inst = (fs_inst *)be_inst;
2764
2765 if (inst->predicate) {
2766 printf("(%cf0.%d) ",
2767 inst->predicate_inverse ? '-' : '+',
2768 inst->flag_subreg);
2769 }
2770
2771 printf("%s", brw_instruction_name(inst->opcode));
2772 if (inst->saturate)
2773 printf(".sat");
2774 if (inst->conditional_mod) {
2775 printf(".cmod");
2776 if (!inst->predicate &&
2777 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2778 inst->opcode != BRW_OPCODE_IF &&
2779 inst->opcode != BRW_OPCODE_WHILE))) {
2780 printf(".f0.%d", inst->flag_subreg);
2781 }
2782 }
2783 printf(" ");
2784
2785
2786 switch (inst->dst.file) {
2787 case GRF:
2788 printf("vgrf%d", inst->dst.reg);
2789 if (inst->dst.reg_offset)
2790 printf("+%d", inst->dst.reg_offset);
2791 break;
2792 case MRF:
2793 printf("m%d", inst->dst.reg);
2794 break;
2795 case BAD_FILE:
2796 printf("(null)");
2797 break;
2798 case UNIFORM:
2799 printf("***u%d***", inst->dst.reg);
2800 break;
2801 default:
2802 printf("???");
2803 break;
2804 }
2805 printf(", ");
2806
2807 for (int i = 0; i < 3; i++) {
2808 if (inst->src[i].negate)
2809 printf("-");
2810 if (inst->src[i].abs)
2811 printf("|");
2812 switch (inst->src[i].file) {
2813 case GRF:
2814 printf("vgrf%d", inst->src[i].reg);
2815 if (inst->src[i].reg_offset)
2816 printf("+%d", inst->src[i].reg_offset);
2817 break;
2818 case MRF:
2819 printf("***m%d***", inst->src[i].reg);
2820 break;
2821 case UNIFORM:
2822 printf("u%d", inst->src[i].reg);
2823 if (inst->src[i].reg_offset)
2824 printf(".%d", inst->src[i].reg_offset);
2825 break;
2826 case BAD_FILE:
2827 printf("(null)");
2828 break;
2829 case IMM:
2830 switch (inst->src[i].type) {
2831 case BRW_REGISTER_TYPE_F:
2832 printf("%ff", inst->src[i].imm.f);
2833 break;
2834 case BRW_REGISTER_TYPE_D:
2835 printf("%dd", inst->src[i].imm.i);
2836 break;
2837 case BRW_REGISTER_TYPE_UD:
2838 printf("%uu", inst->src[i].imm.u);
2839 break;
2840 default:
2841 printf("???");
2842 break;
2843 }
2844 break;
2845 default:
2846 printf("???");
2847 break;
2848 }
2849 if (inst->src[i].abs)
2850 printf("|");
2851
2852 if (i < 3)
2853 printf(", ");
2854 }
2855
2856 printf(" ");
2857
2858 if (inst->force_uncompressed)
2859 printf("1sthalf ");
2860
2861 if (inst->force_sechalf)
2862 printf("2ndhalf ");
2863
2864 printf("\n");
2865 }
2866
2867 /**
2868 * Possibly returns an instruction that set up @param reg.
2869 *
2870 * Sometimes we want to take the result of some expression/variable
2871 * dereference tree and rewrite the instruction generating the result
2872 * of the tree. When processing the tree, we know that the
2873 * instructions generated are all writing temporaries that are dead
2874 * outside of this tree. So, if we have some instructions that write
2875 * a temporary, we're free to point that temp write somewhere else.
2876 *
2877 * Note that this doesn't guarantee that the instruction generated
2878 * only reg -- it might be the size=4 destination of a texture instruction.
2879 */
2880 fs_inst *
2881 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2882 fs_inst *end,
2883 fs_reg reg)
2884 {
2885 if (end == start ||
2886 end->is_partial_write() ||
2887 reg.reladdr ||
2888 !reg.equals(end->dst)) {
2889 return NULL;
2890 } else {
2891 return end;
2892 }
2893 }
2894
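/**
 * Lays out the thread payload delivered by the hardware for gen6+ fragment
 * shaders: masks and pixel X/Y, barycentric coordinates, and interpolated
 * depth/W when used.
 */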
2895 void
2896 fs_visitor::setup_payload_gen6()
2897 {
2898 bool uses_depth =
2899 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2900 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2901
2902 assert(brw->gen >= 6);
2903
2904 /* R0-1: masks, pixel X/Y coordinates. */
2905 c->nr_payload_regs = 2;
2906 /* R2: only for 32-pixel dispatch. */
2907
2908 /* R3-26: barycentric interpolation coordinates. These appear in the
2909 * same order that they appear in the brw_wm_barycentric_interp_mode
2910 * enum. Each set of coordinates occupies 2 registers if dispatch width
2911 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2912 * appear if they were enabled using the "Barycentric Interpolation
2913 * Mode" bits in WM_STATE.
2914 */
2915 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2916 if (barycentric_interp_modes & (1 << i)) {
2917 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2918 c->nr_payload_regs += 2;
2919 if (dispatch_width == 16) {
2920 c->nr_payload_regs += 2;
2921 }
2922 }
2923 }
2924
2925 /* R27: interpolated depth if uses source depth */
2926 if (uses_depth) {
2927 c->source_depth_reg = c->nr_payload_regs;
2928 c->nr_payload_regs++;
2929 if (dispatch_width == 16) {
2930 /* R28: interpolated depth if not 8-wide. */
2931 c->nr_payload_regs++;
2932 }
2933 }
2934 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2935 if (uses_depth) {
2936 c->source_w_reg = c->nr_payload_regs;
2937 c->nr_payload_regs++;
2938 if (dispatch_width == 16) {
2939 /* R30: interpolated W if not 8-wide. */
2940 c->nr_payload_regs++;
2941 }
2942 }
2943 /* R31: MSAA position offsets. */
2944 /* R32-: bary for 32-pixel. */
2945 /* R58-59: interp W for 32-pixel. */
2946
2947 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2948 c->source_depth_to_render_target = true;
2949 }
2950 }
2951
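/**
 * Runs the compile for one dispatch width: emits the FS IR, runs the
 * optimization passes to a fixed point, then schedules, allocates registers,
 * and applies the gen4 SEND workarounds.
 */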
2952 bool
2953 fs_visitor::run()
2954 {
2955 sanity_param_count = fp->Base.Parameters->NumParameters;
2956 uint32_t orig_nr_params = c->prog_data.nr_params;
2957
2958 if (brw->gen >= 6)
2959 setup_payload_gen6();
2960 else
2961 setup_payload_gen4();
2962
2963 if (0) {
2964 emit_dummy_fs();
2965 } else {
2966 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2967 emit_shader_time_begin();
2968
2969 calculate_urb_setup();
2970 if (brw->gen < 6)
2971 emit_interpolation_setup_gen4();
2972 else
2973 emit_interpolation_setup_gen6();
2974
2975 /* We handle discards by keeping track of the still-live pixels in f0.1.
2976 * Initialize it with the dispatched pixels.
2977 */
2978 if (fp->UsesKill) {
2979 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2980 discard_init->flag_subreg = 1;
2981 }
2982
2983 /* Generate FS IR for main(). (the visitor only descends into
2984 * functions called "main").
2985 */
2986 if (shader) {
2987 foreach_list(node, &*shader->ir) {
2988 ir_instruction *ir = (ir_instruction *)node;
2989 base_ir = ir;
2990 this->result = reg_undef;
2991 ir->accept(this);
2992 }
2993 } else {
2994 emit_fragment_program_code();
2995 }
2996 base_ir = NULL;
2997 if (failed)
2998 return false;
2999
3000 emit(FS_OPCODE_PLACEHOLDER_HALT);
3001
3002 emit_fb_writes();
3003
3004 split_virtual_grfs();
3005
3006 move_uniform_array_access_to_pull_constants();
3007 setup_pull_constants();
3008
3009 bool progress;
3010 do {
3011 progress = false;
3012
3013 compact_virtual_grfs();
3014
3015 progress = remove_duplicate_mrf_writes() || progress;
3016
3017 progress = opt_algebraic() || progress;
3018 progress = opt_cse() || progress;
3019 progress = opt_copy_propagate() || progress;
3020 progress = dead_code_eliminate() || progress;
3021 progress = dead_code_eliminate_local() || progress;
3022 progress = register_coalesce() || progress;
3023 progress = register_coalesce_2() || progress;
3024 progress = compute_to_mrf() || progress;
3025 } while (progress);
3026
3027 remove_dead_constants();
3028
3029 schedule_instructions(false);
3030
3031 lower_uniform_pull_constant_loads();
3032
3033 assign_curb_setup();
3034 assign_urb_setup();
3035
3036 if (0) {
3037 /* Debug of register spilling: Go spill everything. */
3038 for (int i = 0; i < virtual_grf_count; i++) {
3039 spill_reg(i);
3040 }
3041 }
3042
3043 if (0)
3044 assign_regs_trivial();
3045 else {
3046 while (!assign_regs()) {
3047 if (failed)
3048 break;
3049 }
3050 }
3051 }
3052 assert(force_uncompressed_stack == 0);
3053 assert(force_sechalf_stack == 0);
3054
3055 /* This must come after all optimization and register allocation, since
3056 * it inserts dead code that happens to have side effects, and it does
3057 * so based on the actual physical registers in use.
3058 */
3059 insert_gen4_send_dependency_workarounds();
3060
3061 if (failed)
3062 return false;
3063
3064 schedule_instructions(true);
3065
3066 if (dispatch_width == 8) {
3067 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3068 } else {
3069 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3070
3071 /* Make sure we didn't try to sneak in an extra uniform */
3072 assert(orig_nr_params == c->prog_data.nr_params);
3073 (void) orig_nr_params;
3074 }
3075
3076 /* If any state parameters were appended, then ParameterValues could have
3077 * been realloced, in which case the driver uniform storage set up by
3078 * _mesa_associate_uniform_storage() would point to freed memory. Make
3079 * sure that didn't happen.
3080 */
3081 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3082
3083 return !failed;
3084 }
3085
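/**
 * Compiles the fragment program to native code: always builds an 8-wide
 * program, and additionally a 16-wide one when the hardware and program
 * allow it.
 */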
3086 const unsigned *
3087 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3088 struct gl_fragment_program *fp,
3089 struct gl_shader_program *prog,
3090 unsigned *final_assembly_size)
3091 {
3092 bool start_busy = false;
3093 float start_time = 0;
3094
3095 if (unlikely(brw->perf_debug)) {
3096 start_busy = (brw->batch.last_bo &&
3097 drm_intel_bo_busy(brw->batch.last_bo));
3098 start_time = get_time();
3099 }
3100
3101 struct brw_shader *shader = NULL;
3102 if (prog)
3103 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3104
3105 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3106 if (prog) {
3107 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3108 _mesa_print_ir(shader->ir, NULL);
3109 printf("\n\n");
3110 } else {
3111 printf("ARB_fragment_program %d ir for native fragment shader\n",
3112 fp->Base.Id);
3113 _mesa_print_program(&fp->Base);
3114 }
3115 }
3116
3117 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3118 */
3119 fs_visitor v(brw, c, prog, fp, 8);
3120 if (!v.run()) {
3121 if (prog) {
3122 prog->LinkStatus = false;
3123 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3124 }
3125
3126 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3127 v.fail_msg);
3128
3129 return NULL;
3130 }
3131
3132 exec_list *simd16_instructions = NULL;
3133 fs_visitor v2(brw, c, prog, fp, 16);
3134 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3135 if (c->prog_data.nr_pull_params == 0) {
3136 /* Try a 16-wide compile */
3137 v2.import_uniforms(&v);
3138 if (!v2.run()) {
3139 perf_debug("16-wide shader failed to compile, falling back to "
3140 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3141 } else {
3142 simd16_instructions = &v2.instructions;
3143 }
3144 } else {
3145 perf_debug("Skipping 16-wide due to pull parameters.\n");
3146 }
3147 }
3148
3149 c->prog_data.dispatch_width = 8;
3150
3151 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3152 const unsigned *generated = g.generate_assembly(&v.instructions,
3153 simd16_instructions,
3154 final_assembly_size);
3155
3156 if (unlikely(brw->perf_debug) && shader) {
3157 if (shader->compiled_once)
3158 brw_wm_debug_recompile(brw, prog, &c->key);
3159 shader->compiled_once = true;
3160
3161 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3162 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3163 (get_time() - start_time) * 1000);
3164 }
3165 }
3166
3167 return generated;
3168 }
3169
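/**
 * Precompiles the fragment shader at link time using a guessed program key,
 * then restores the previously bound compiled-program state.
 */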
3170 bool
3171 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3172 {
3173 struct brw_context *brw = brw_context(ctx);
3174 struct brw_wm_prog_key key;
3175
3176 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3177 return true;
3178
3179 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3180 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3181 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3182 bool program_uses_dfdy = fp->UsesDFdy;
3183
3184 memset(&key, 0, sizeof(key));
3185
3186 if (brw->gen < 6) {
3187 if (fp->UsesKill)
3188 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3189
3190 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3191 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3192
3193 /* Just assume depth testing. */
3194 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3195 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3196 }
3197
3198 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3199 BRW_FS_VARYING_INPUT_MASK) > 16)
3200 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3201
3202 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3203
3204 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3205 for (unsigned i = 0; i < sampler_count; i++) {
3206 if (fp->Base.ShadowSamplers & (1 << i)) {
3207 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3208 key.tex.swizzles[i] =
3209 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3210 } else {
3211 /* Color sampler: assume no swizzling. */
3212 key.tex.swizzles[i] = SWIZZLE_XYZW;
3213 }
3214 }
3215
3216 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3217 key.drawable_height = ctx->DrawBuffer->Height;
3218 }
3219
3220 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3221 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3222 }
3223
3224 key.nr_color_regions = 1;
3225
3226 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3227 * quality of the derivatives is likely to be determined by the driconf
3228 * option.
3229 */
3230 key.high_quality_derivatives = brw->disable_derivative_optimization;
3231
3232 key.program_string_id = bfp->id;
3233
3234 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3235 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3236
3237 bool success = do_wm_prog(brw, prog, bfp, &key);
3238
3239 brw->wm.base.prog_offset = old_prog_offset;
3240 brw->wm.prog_data = old_prog_data;
3241
3242 return success;
3243 }