i965/fs: Improve performance of varying-index uniform loads on IVB.
src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

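/**
 * Zero-initializes an instruction and sets the few fields with non-zero
 * defaults; every fs_inst constructor below starts with this.
 */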
void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

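/* Convenience builders for the common ALU opcodes.  Each macro expands to a
 * fs_visitor method that constructs a new fs_inst out of mem_ctx; note that
 * these only build the instruction -- the caller still has to emit() it.
 */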
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      /* We have our constant surface use a pitch of 4 bytes, so our index can
       * be any component of a vector, and then we load 4 contiguous
       * components starting from that.
       *
       * We break down the const_offset to a portion added to the variable
       * offset and a portion done using reg_offset, which means that if you
       * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
       * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
       * CSE can later notice that those loads are all the same and eliminate
       * the redundant ones.
       */
      fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
      instructions.push_tail(ADD(vec4_offset,
                                 varying_offset, const_offset & ~3));

      fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4), dst.type);
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  vec4_result, surf_index, vec4_offset);
      instructions.push_tail(inst);

      vec4_result.reg_offset += const_offset & 3;
      instructions.push_tail(MOV(dst, vec4_result));
   } else {
      fs_reg offset = fs_reg(this, glsl_type::uint_type);
      instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));

      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

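/** Field-by-field comparison, used to detect identical instructions. */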
bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex() || opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7)
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXF_MS ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS ||
           opcode == SHADER_OPCODE_LOD);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_control_flow()
{
   switch (opcode) {
   case BRW_OPCODE_DO:
   case BRW_OPCODE_WHILE:
   case BRW_OPCODE_IF:
   case BRW_OPCODE_ELSE:
   case BRW_OPCODE_ENDIF:
   case BRW_OPCODE_BREAK:
   case BRW_OPCODE_CONTINUE:
      return true;
   default:
      return false;
   }
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}

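/**
 * Reads the GPU timestamp architecture register into a GRF and returns the
 * register, with smear set so callers see the low 32 bits.
 */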
fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

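/**
 * Emits the SHADER_TIME_ADD that accumulates /value/ into the shader-time
 * buffer slot for /type/, at byte offset
 * shader_time_index * SHADER_TIME_STRIDE.
 */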
void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index = brw_get_shader_time_index(brw, prog, &fp->Base,
                                                     type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                fs_reg(), payload, offset, value));
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

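/* Convenience emit() overloads: each constructs an fs_inst with the given
 * operands and hands it to the main emit(), which appends it to the
 * instruction list.
 */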
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

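/**
 * Allocates a new virtual GRF of /size/ registers and returns its index,
 * growing the virtual_grf_sizes array geometrically (16, 32, 64, ...) as
 * needed.
 */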
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been set up by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

927
928 fs_reg *
929 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
930 {
931 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
932 fs_reg wpos = *reg;
933 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
934
935 /* gl_FragCoord.x */
936 if (ir->pixel_center_integer) {
937 emit(MOV(wpos, this->pixel_x));
938 } else {
939 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
940 }
941 wpos.reg_offset++;
942
943 /* gl_FragCoord.y */
944 if (!flip && ir->pixel_center_integer) {
945 emit(MOV(wpos, this->pixel_y));
946 } else {
947 fs_reg pixel_y = this->pixel_y;
948 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
949
950 if (flip) {
951 pixel_y.negate = true;
952 offset += c->key.drawable_height - 1.0;
953 }
954
955 emit(ADD(wpos, pixel_y, fs_reg(offset)));
956 }
957 wpos.reg_offset++;
958
959 /* gl_FragCoord.z */
960 if (intel->gen >= 6) {
961 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
962 } else {
963 emit(FS_OPCODE_LINTERP, wpos,
964 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
965 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
966 interp_reg(VARYING_SLOT_POS, 2));
967 }
968 wpos.reg_offset++;
969
970 /* gl_FragCoord.w: Already set up in emit_interpolation */
971 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
972
973 return reg;
974 }
975
fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= VARYING_SLOT_TEX0 &&
                   location <= VARYING_SLOT_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask
                               & BITFIELD64_BIT(location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

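/* Builds the urb_setup[] mapping from varying slots to incoming setup-reg
 * slots, and computes the resulting urb_read_length.
 */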
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         split_grf[inst->src[0].reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

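/**
 * Drops push-constant params that no instruction reads and renumbers the
 * UNIFORM-file registers to match.  The remap table is built during the
 * 8-wide compile and reused by the 16-wide one.
 */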
bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index,
                                                     *inst->src[i].reladdr,
                                                     pull_constant_loc[uniform] +
                                                     inst->src[i].reg_offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

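/** Local algebraic simplification: rewrites MUL/ADD with a trivial
 * immediate (a * 1.0, a * 0.0, a + 0.0) into a MOV.
 */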
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something def'd but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *  ^
       *  |
       *  v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
       */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}

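/**
 * Removes raw MOVs by rewriting everything after the MOV to use its source
 * instead.  Only runs outside of control flow, since the forward scan below
 * assumes the MOV dominates all later instructions.
 */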
1911 bool
1912 fs_visitor::register_coalesce()
1913 {
1914 bool progress = false;
1915 int if_depth = 0;
1916 int loop_depth = 0;
1917
1918 foreach_list_safe(node, &this->instructions) {
1919 fs_inst *inst = (fs_inst *)node;
1920
1921 /* Make sure that we dominate the instructions we're going to
1922 * scan for interfering with our coalescing, or we won't have
1923 * scanned enough to see if anything interferes with our
1924 * coalescing. We don't dominate the following instructions if
1925 * we're in a loop or an if block.
1926 */
1927 switch (inst->opcode) {
1928 case BRW_OPCODE_DO:
1929 loop_depth++;
1930 break;
1931 case BRW_OPCODE_WHILE:
1932 loop_depth--;
1933 break;
1934 case BRW_OPCODE_IF:
1935 if_depth++;
1936 break;
1937 case BRW_OPCODE_ENDIF:
1938 if_depth--;
1939 break;
1940 default:
1941 break;
1942 }
1943 if (loop_depth || if_depth)
1944 continue;
1945
1946 if (inst->opcode != BRW_OPCODE_MOV ||
1947 inst->predicate ||
1948 inst->saturate ||
1949 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1950 inst->src[0].file != UNIFORM)||
1951 inst->dst.type != inst->src[0].type)
1952 continue;
1953
1954 bool has_source_modifiers = (inst->src[0].abs ||
1955 inst->src[0].negate ||
1956 inst->src[0].smear != -1 ||
1957 inst->src[0].file == UNIFORM);
1958
1959 /* Found a move of a GRF (or uniform) to a GRF. Let's see if we can
1960 * coalesce them: check for no writes to either one until the exit of
1961 * the program.
1962 */
1963 bool interfered = false;
1964
1965 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1966 !scan_inst->is_tail_sentinel();
1967 scan_inst = (fs_inst *)scan_inst->next) {
1968 if (scan_inst->dst.file == GRF) {
1969 if (scan_inst->overwrites_reg(inst->dst) ||
1970 scan_inst->overwrites_reg(inst->src[0])) {
1971 interfered = true;
1972 break;
1973 }
1974 }
1975
1976 /* The gen6 MATH instruction can't handle source modifiers or
1977 * unusual register regions, so avoid coalescing those for
1978 * now. We should do something more specific.
1979 */
1980 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1981 interfered = true;
1982 break;
1983 }
1984
1985 /* The accumulator result appears to get used for the
1986 * conditional modifier generation. When negating a UD
1987 * value, there is a 33rd bit generated for the sign in the
1988 * accumulator value, so now you can't check, for example,
1989 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1990 */
1991 if (scan_inst->conditional_mod &&
1992 inst->src[0].negate &&
1993 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1994 interfered = true;
1995 break;
1996 }
1997 }
1998 if (interfered) {
1999 continue;
2000 }
2001
2002 /* Rewrite the later usage to point at the source of the move to
2003 * be removed.
2004 */
2005 for (fs_inst *scan_inst = inst;
2006 !scan_inst->is_tail_sentinel();
2007 scan_inst = (fs_inst *)scan_inst->next) {
2008 for (int i = 0; i < 3; i++) {
2009 if (scan_inst->src[i].file == GRF &&
2010 scan_inst->src[i].reg == inst->dst.reg &&
2011 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2012 fs_reg new_src = inst->src[0];
2013 if (scan_inst->src[i].abs) {
2014 new_src.negate = 0;
2015 new_src.abs = 1;
2016 }
2017 new_src.negate ^= scan_inst->src[i].negate;
2018 scan_inst->src[i] = new_src;
2019 }
2020 }
2021 }
2022
2023 inst->remove();
2024 progress = true;
2025 }
2026
2027 if (progress)
2028 live_intervals_valid = false;
2029
2030 return progress;
2031 }
2032
2033
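/**
 * Attempts to rewrite the instruction that computed a GRF value so that it
 * writes directly into the MRF that a following raw MOV was copying it to,
 * letting the MOV be removed.
 *
 * A sketch of the rewrite, with hypothetical register numbers:
 *
 *    add vgrf3, vgrf1, vgrf2
 *    mov m4, vgrf3             becomes   add m4, vgrf1, vgrf2
 */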
2034 bool
2035 fs_visitor::compute_to_mrf()
2036 {
2037 bool progress = false;
2038 int next_ip = 0;
2039
2040 calculate_live_intervals();
2041
2042 foreach_list_safe(node, &this->instructions) {
2043 fs_inst *inst = (fs_inst *)node;
2044
2045 int ip = next_ip;
2046 next_ip++;
2047
2048 if (inst->opcode != BRW_OPCODE_MOV ||
2049 inst->predicate ||
2050 inst->dst.file != MRF || inst->src[0].file != GRF ||
2051 inst->dst.type != inst->src[0].type ||
2052 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2053 continue;
2054
2055 /* Work out which hardware MRF registers are written by this
2056 * instruction.
2057 */
2058 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2059 int mrf_high;
2060 if (inst->dst.reg & BRW_MRF_COMPR4) {
2061 mrf_high = mrf_low + 4;
2062 } else if (dispatch_width == 16 &&
2063 (!inst->force_uncompressed && !inst->force_sechalf)) {
2064 mrf_high = mrf_low + 1;
2065 } else {
2066 mrf_high = mrf_low;
2067 }
2068
2069 /* Can't compute-to-MRF this GRF if someone else was going to
2070 * read it later.
2071 */
2072 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2073 continue;
2074
2075 /* Found a move of a GRF to a MRF. Let's see if we can go
2076 * rewrite the thing that made this GRF to write into the MRF.
2077 */
2078 fs_inst *scan_inst;
2079 for (scan_inst = (fs_inst *)inst->prev;
2080 scan_inst->prev != NULL;
2081 scan_inst = (fs_inst *)scan_inst->prev) {
2082 if (scan_inst->dst.file == GRF &&
2083 scan_inst->dst.reg == inst->src[0].reg) {
2084 /* Found the last thing to write our reg we want to turn
2085 * into a compute-to-MRF.
2086 */
2087
2088 /* If it's predicated, it (probably) didn't populate all
2089 * the channels. We might be able to rewrite everything
2090 * that writes that reg, but it would require smarter
2091 * tracking to delay the rewriting until complete success.
2092 */
2093 if (scan_inst->predicate)
2094 break;
2095
2096 /* If it's half of register setup and not the same half as
2097 * our MOV we're trying to remove, bail for now.
2098 */
2099 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2100 scan_inst->force_sechalf != inst->force_sechalf) {
2101 break;
2102 }
2103
2104 /* Things returning more than one register would need us to
2105 * understand coalescing out more than one MOV at a time.
2106 */
2107 if (scan_inst->regs_written() > 1)
2108 break;
2109
2110 /* SEND instructions can't have MRF as a destination. */
2111 if (scan_inst->mlen)
2112 break;
2113
2114 if (intel->gen == 6) {
2115 /* gen6 math instructions must have the destination be
2116 * GRF, so no compute-to-MRF for them.
2117 */
2118 if (scan_inst->is_math()) {
2119 break;
2120 }
2121 }
2122
2123 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2124 /* Found the creator of our MRF's source value. */
2125 scan_inst->dst.file = MRF;
2126 scan_inst->dst.reg = inst->dst.reg;
2127 scan_inst->saturate |= inst->saturate;
2128 inst->remove();
2129 progress = true;
2130 }
2131 break;
2132 }
2133
2134 /* We don't handle control flow here. Most computation of
2135 * values that end up in MRFs happens shortly before the MRF
2136 * write anyway.
2137 */
2138 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2139 break;
2140
2141 /* You can't read from an MRF, so if someone else reads our
2142 * MRF's source GRF that we wanted to rewrite, that stops us.
2143 */
2144 bool interfered = false;
2145 for (int i = 0; i < 3; i++) {
2146 if (scan_inst->src[i].file == GRF &&
2147 scan_inst->src[i].reg == inst->src[0].reg &&
2148 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2149 interfered = true;
2150 }
2151 }
2152 if (interfered)
2153 break;
2154
2155 if (scan_inst->dst.file == MRF) {
2156 /* If somebody else writes our MRF here, we can't
2157 * compute-to-MRF before that.
2158 */
2159 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2160 int scan_mrf_high;
2161
2162 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2163 scan_mrf_high = scan_mrf_low + 4;
2164 } else if (dispatch_width == 16 &&
2165 (!scan_inst->force_uncompressed &&
2166 !scan_inst->force_sechalf)) {
2167 scan_mrf_high = scan_mrf_low + 1;
2168 } else {
2169 scan_mrf_high = scan_mrf_low;
2170 }
2171
2172 if (mrf_low == scan_mrf_low ||
2173 mrf_low == scan_mrf_high ||
2174 mrf_high == scan_mrf_low ||
2175 mrf_high == scan_mrf_high) {
2176 break;
2177 }
2178 }
2179
2180 if (scan_inst->mlen > 0) {
2181 /* Found a SEND instruction, which means that there are
2182 * live values in MRFs from base_mrf to base_mrf +
2183 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2184 * above it.
2185 */
2186 if (mrf_low >= scan_inst->base_mrf &&
2187 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2188 break;
2189 }
2190 if (mrf_high >= scan_inst->base_mrf &&
2191 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2192 break;
2193 }
2194 }
2195 }
2196 }
2197
2198 if (progress)
2199 live_intervals_valid = false;
2200
2201 return progress;
2202 }
2203
2204 /**
2205 * Walks through basic blocks, looking for repeated MRF writes and
2206 * removing the later ones.
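*
* For example (hypothetical registers), the second MOV below is redundant
* and gets removed, as long as neither m2 nor vgrf3 is written in between:
*
*    mov m2, vgrf3
*    ...
*    mov m2, vgrf3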
2207 */
2208 bool
2209 fs_visitor::remove_duplicate_mrf_writes()
2210 {
2211 fs_inst *last_mrf_move[16];
2212 bool progress = false;
2213
2214 /* Need to update the MRF tracking for compressed instructions. */
2215 if (dispatch_width == 16)
2216 return false;
2217
2218 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2219
2220 foreach_list_safe(node, &this->instructions) {
2221 fs_inst *inst = (fs_inst *)node;
2222
2223 if (inst->is_control_flow()) {
2224 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2225 }
2226
2227 if (inst->opcode == BRW_OPCODE_MOV &&
2228 inst->dst.file == MRF) {
2229 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2230 if (prev_inst && inst->equals(prev_inst)) {
2231 inst->remove();
2232 progress = true;
2233 continue;
2234 }
2235 }
2236
2237 /* Clear out the last-write records for MRFs that were overwritten. */
2238 if (inst->dst.file == MRF) {
2239 last_mrf_move[inst->dst.reg] = NULL;
2240 }
2241
2242 if (inst->mlen > 0) {
2243 /* Found a SEND instruction, which will include two or fewer
2244 * implied MRF writes. We could do better here.
2245 */
2246 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2247 last_mrf_move[inst->base_mrf + i] = NULL;
2248 }
2249 }
2250
2251 /* Clear out any MRF move records whose sources got overwritten. */
2252 if (inst->dst.file == GRF) {
2253 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2254 if (last_mrf_move[i] &&
2255 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2256 last_mrf_move[i] = NULL;
2257 }
2258 }
2259 }
2260
2261 if (inst->opcode == BRW_OPCODE_MOV &&
2262 inst->dst.file == MRF &&
2263 inst->src[0].file == GRF &&
2264 !inst->predicate) {
2265 last_mrf_move[inst->dst.reg] = inst;
2266 }
2267 }
2268
2269 if (progress)
2270 live_intervals_valid = false;
2271
2272 return progress;
2273 }
2274
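/**
 * Clears the dependency-tracking flags for any GRFs in the range
 * [first_grf, first_grf + grf_len) that @inst reads, since a read resolves
 * the outstanding dependency for the gen4 SEND workarounds below. A
 * 16-wide instruction reads a pair of registers per source, so the
 * following register's flag is cleared as well.
 */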
2275 static void
2276 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2277 int first_grf, int grf_len)
2278 {
2279 bool inst_16wide = (dispatch_width > 8 &&
2280 !inst->force_uncompressed &&
2281 !inst->force_sechalf);
2282
2283 /* Clear the flag for registers that actually got read (as expected). */
2284 for (int i = 0; i < 3; i++) {
2285 int grf;
2286 if (inst->src[i].file == GRF) {
2287 grf = inst->src[i].reg;
2288 } else if (inst->src[i].file == FIXED_HW_REG &&
2289 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2290 grf = inst->src[i].fixed_hw_reg.nr;
2291 } else {
2292 continue;
2293 }
2294
2295 if (grf >= first_grf &&
2296 grf < first_grf + grf_len) {
2297 deps[grf - first_grf] = false;
2298 if (inst_16wide)
2299 deps[grf - first_grf + 1] = false;
2300 }
2301 }
2302 }
2303
2304 /**
2305 * Implements this workaround for the original 965:
2306 *
2307 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2308 * check for post destination dependencies on this instruction, software
2309 * must ensure that there is no destination hazard for the case of ‘write
2310 * followed by a posted write’ shown in the following example.
2311 *
2312 * 1. mov r3 0
2313 * 2. send r3.xy <rest of send instruction>
2314 * 3. mov r2 r3
2315 *
2316 * Due to no post-destination dependency check on the ‘send’, the above
2317 * code sequence could have two instructions (1 and 2) in flight at the
2318 * same time that both consider ‘r3’ as the target of their final writes."
2319 */
2320 void
2321 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2322 {
2323 int reg_size = dispatch_width / 8;
2324 int write_len = inst->regs_written() * reg_size;
2325 int first_write_grf = inst->dst.reg;
2326 bool needs_dep[BRW_MAX_MRF];
2327 assert(write_len < (int)sizeof(needs_dep) - 1);
2328
2329 memset(needs_dep, false, sizeof(needs_dep));
2330 memset(needs_dep, true, write_len);
2331
2332 clear_deps_for_inst_src(inst, dispatch_width,
2333 needs_dep, first_write_grf, write_len);
2334
2335 /* Walk backwards looking for writes to registers we're writing which
2336 * aren't read since being written. If we hit the start of the program,
2337 * we assume that there are no outstanding dependencies on entry to the
2338 * program.
2339 */
2340 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2341 scan_inst != NULL;
2342 scan_inst = (fs_inst *)scan_inst->prev) {
2343
2344 /* If we hit control flow, assume that there *are* outstanding
2345 * dependencies, and force their cleanup before our instruction.
2346 */
2347 if (scan_inst->is_control_flow()) {
2348 for (int i = 0; i < write_len; i++) {
2349 if (needs_dep[i]) {
2350 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2351 }
2352 }
/* Every outstanding dependency was just resolved; don't keep scanning. */
return;
2353 }
2354
2355 bool scan_inst_16wide = (dispatch_width > 8 &&
2356 !scan_inst->force_uncompressed &&
2357 !scan_inst->force_sechalf);
2358
2359 /* We insert our reads as late as possible on the assumption that any
2360 * instruction but a MOV that might have left us an outstanding
2361 * dependency has more latency than a MOV.
2362 */
2363 if (scan_inst->dst.file == GRF) {
2364 for (int i = 0; i < scan_inst->regs_written(); i++) {
2365 int reg = scan_inst->dst.reg + i * reg_size;
2366
2367 if (reg >= first_write_grf &&
2368 reg < first_write_grf + write_len &&
2369 needs_dep[reg - first_write_grf]) {
2370 inst->insert_before(DEP_RESOLVE_MOV(reg));
2371 needs_dep[reg - first_write_grf] = false;
2372 if (scan_inst_16wide)
2373 needs_dep[reg - first_write_grf + 1] = false;
2374 }
2375 }
2376 }
2377
2378 /* Clear the flag for registers that actually got read (as expected). */
2379 clear_deps_for_inst_src(scan_inst, dispatch_width,
2380 needs_dep, first_write_grf, write_len);
2381
2382 /* Continue the loop only if we haven't resolved all the dependencies */
2383 int i;
2384 for (i = 0; i < write_len; i++) {
2385 if (needs_dep[i])
2386 break;
2387 }
2388 if (i == write_len)
2389 return;
2390 }
2391 }
2392
2393 /**
2394 * Implements this workaround for the original 965:
2395 *
2396 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2397 * used as a destination register until after it has been sourced by an
2398 * instruction with a different destination register."
2399 */
2400 void
2401 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2402 {
2403 int write_len = inst->regs_written() * dispatch_width / 8;
2404 int first_write_grf = inst->dst.reg;
2405 bool needs_dep[BRW_MAX_MRF];
2406 assert(write_len < (int)sizeof(needs_dep) - 1);
2407
2408 memset(needs_dep, false, sizeof(needs_dep));
2409 memset(needs_dep, true, write_len);
2410 /* Walk forwards looking for writes to the registers we wrote that occur
2411 * before those registers have been read.
2412 */
2413 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2414 !scan_inst->is_tail_sentinel();
2415 scan_inst = (fs_inst *)scan_inst->next) {
2416 /* If we hit control flow, force resolve all remaining dependencies. */
2417 if (scan_inst->is_control_flow()) {
2418 for (int i = 0; i < write_len; i++) {
2419 if (needs_dep[i])
2420 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2421 }
/* Every outstanding dependency was just resolved; don't keep scanning. */
return;
2422 }
2423
2424 /* Clear the flag for registers that actually got read (as expected). */
2425 clear_deps_for_inst_src(scan_inst, dispatch_width,
2426 needs_dep, first_write_grf, write_len);
2427
2428 /* We insert our reads as late as possible since they're reading the
2429 * result of a SEND, which has massive latency.
2430 */
2431 if (scan_inst->dst.file == GRF &&
2432 scan_inst->dst.reg >= first_write_grf &&
2433 scan_inst->dst.reg < first_write_grf + write_len &&
2434 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2435 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2436 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2437 }
2438
2439 /* Continue the loop only if we haven't resolved all the dependencies */
2440 int i;
2441 for (i = 0; i < write_len; i++) {
2442 if (needs_dep[i])
2443 break;
2444 }
2445 if (i == write_len)
2446 return;
2447 }
2448
2449 /* If we hit the end of the program, resolve all remaining dependencies out
2450 * of paranoia.
2451 */
2452 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2453 assert(last_inst->eot);
2454 for (int i = 0; i < write_len; i++) {
2455 if (needs_dep[i])
2456 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2457 }
2458 }
2459
2460 void
2461 fs_visitor::insert_gen4_send_dependency_workarounds()
2462 {
2463 if (intel->gen != 4 || intel->is_g4x)
2464 return;
2465
2466 /* Note that we're done with register allocation, so GRF fs_regs always
2467 * have a .reg_offset of 0.
2468 */
2469
2470 foreach_list_safe(node, &this->instructions) {
2471 fs_inst *inst = (fs_inst *)node;
2472
2473 if (inst->mlen != 0 && inst->dst.file == GRF) {
2474 insert_gen4_pre_send_dependency_workarounds(inst);
2475 insert_gen4_post_send_dependency_workarounds(inst);
2476 }
2477 }
2478 }
2479
2480 /**
2481 * Turns the generic expression-style uniform pull constant load instruction
2482 * into a hardware-specific series of instructions for loading a pull
2483 * constant.
2484 *
2485 * The expression style allows the CSE pass before this to optimize out
2486 * repeated loads from the same offset, and gives the pre-register-allocation
2487 * scheduling full flexibility, while the conversion to native instructions
2488 * allows the post-register-allocation scheduler the best information
2489 * possible.
2490 *
2491 * Note that execution masking for setting up pull constant loads is special:
2492 * the channels that need to be written are unrelated to the current execution
2493 * mask, since a later instruction will use one of the result channels as a
2494 * source operand for all 8 or 16 of its channels.
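*
* A rough sketch of the gen7 lowering done below (illustrative, not a
* literal instruction dump):
*
*    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD dst, surface, byte_offset
* becomes
*    FS_OPCODE_SET_SIMD4X2_OFFSET payload, byte_offset / 4
*    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 dst, surface, payload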
2495 */
2496 void
2497 fs_visitor::lower_uniform_pull_constant_loads()
2498 {
2499 foreach_list(node, &this->instructions) {
2500 fs_inst *inst = (fs_inst *)node;
2501
2502 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2503 continue;
2504
2505 if (intel->gen >= 7) {
2506 /* The offset arg before was a vec4-aligned byte offset. We need to
2507 * turn it into a dword offset.
2508 */
2509 fs_reg const_offset_reg = inst->src[1];
2510 assert(const_offset_reg.file == IMM &&
2511 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2512 const_offset_reg.imm.u /= 4;
2513 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2514
2515 /* This is actually going to be a MOV, but since only the first dword
2516 * is accessed, we have a special opcode to do just that one. Note
2517 * that this needs to be an operation that will be considered a def
2518 * by live variable analysis, or register allocation will explode.
2519 */
2520 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2521 payload, const_offset_reg);
2522 setup->force_writemask_all = true;
2523
2524 setup->ir = inst->ir;
2525 setup->annotation = inst->annotation;
2526 inst->insert_before(setup);
2527
2528 /* Similarly, this will only populate the first 4 channels of the
2529 * result register (since we only use smear values from 0-3), but we
2530 * don't tell the optimizer.
2531 */
2532 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2533 inst->src[1] = payload;
2534
2535 this->live_intervals_valid = false;
2536 } else {
2537 /* Before register allocation, we didn't tell the scheduler about the
2538 * MRF we use. We know it's safe to use this MRF because nothing
2539 * else does except for register spill/unspill, which generates and
2540 * uses its MRF within a single IR instruction.
2541 */
2542 inst->base_mrf = 14;
2543 inst->mlen = 1;
2544 }
2545 }
2546 }
2547
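/**
 * Prints one instruction in a human-readable form, e.g. (an illustrative,
 * hypothetical line):
 *
 *    (+f0.0) add.sat vgrf7+1, vgrf3, u2, (null)
 */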
2548 void
2549 fs_visitor::dump_instruction(fs_inst *inst)
2550 {
2551 if (inst->predicate) {
2552 printf("(%cf0.%d) ",
2553 inst->predicate_inverse ? '-' : '+',
2554 inst->flag_subreg);
2555 }
2556
2557 printf("%s", brw_instruction_name(inst->opcode));
2558 if (inst->saturate)
2559 printf(".sat");
2560 if (inst->conditional_mod) {
2561 printf(".cmod");
2562 if (!inst->predicate &&
2563 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2564 inst->opcode != BRW_OPCODE_IF &&
2565 inst->opcode != BRW_OPCODE_WHILE))) {
2566 printf(".f0.%d", inst->flag_subreg);
2567 }
2568 }
2569 printf(" ");
2570
2571
2572 switch (inst->dst.file) {
2573 case GRF:
2574 printf("vgrf%d", inst->dst.reg);
2575 if (inst->dst.reg_offset)
2576 printf("+%d", inst->dst.reg_offset);
2577 break;
2578 case MRF:
2579 printf("m%d", inst->dst.reg);
2580 break;
2581 case BAD_FILE:
2582 printf("(null)");
2583 break;
2584 case UNIFORM:
2585 printf("***u%d***", inst->dst.reg);
2586 break;
2587 default:
2588 printf("???");
2589 break;
2590 }
2591 printf(", ");
2592
2593 for (int i = 0; i < 3; i++) {
2594 if (inst->src[i].negate)
2595 printf("-");
2596 if (inst->src[i].abs)
2597 printf("|");
2598 switch (inst->src[i].file) {
2599 case GRF:
2600 printf("vgrf%d", inst->src[i].reg);
2601 if (inst->src[i].reg_offset)
2602 printf("+%d", inst->src[i].reg_offset);
2603 break;
2604 case MRF:
2605 printf("***m%d***", inst->src[i].reg);
2606 break;
2607 case UNIFORM:
2608 printf("u%d", inst->src[i].reg);
2609 if (inst->src[i].reg_offset)
2610 printf(".%d", inst->src[i].reg_offset);
2611 break;
2612 case BAD_FILE:
2613 printf("(null)");
2614 break;
2615 case IMM:
2616 switch (inst->src[i].type) {
2617 case BRW_REGISTER_TYPE_F:
2618 printf("%ff", inst->src[i].imm.f);
2619 break;
2620 case BRW_REGISTER_TYPE_D:
2621 printf("%dd", inst->src[i].imm.i);
2622 break;
2623 case BRW_REGISTER_TYPE_UD:
2624 printf("%uu", inst->src[i].imm.u);
2625 break;
2626 default:
2627 printf("???");
2628 break;
2629 }
2630 break;
2631 default:
2632 printf("???");
2633 break;
2634 }
2635 if (inst->src[i].abs)
2636 printf("|");
2637
2638 if (i < 2)
2639 printf(", ");
2640 }
2641
2642 printf(" ");
2643
2644 if (inst->force_uncompressed)
2645 printf("1sthalf ");
2646
2647 if (inst->force_sechalf)
2648 printf("2ndhalf ");
2649
2650 printf("\n");
2651 }
2652
2653 void
2654 fs_visitor::dump_instructions()
2655 {
2656 int ip = 0;
2657 foreach_list(node, &this->instructions) {
2658 fs_inst *inst = (fs_inst *)node;
2659 printf("%d: ", ip++);
2660 dump_instruction(inst);
2661 }
2662 }
2663
2664 /**
2665 * Possibly returns an instruction that set up @param reg.
2666 *
2667 * Sometimes we want to take the result of some expression/variable
2668 * dereference tree and rewrite the instruction generating the result
2669 * of the tree. When processing the tree, we know that the
2670 * instructions generated are all writing temporaries that are dead
2671 * outside of this tree. So, if we have some instructions that write
2672 * a temporary, we're free to point that temp write somewhere else.
2673 *
2674 * Note that this doesn't guarantee that the returned instruction wrote
2675 * only reg -- it might be the size=4 destination of a texture instruction.
2676 */
2677 fs_inst *
2678 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2679 fs_inst *end,
2680 fs_reg reg)
2681 {
2682 if (end == start ||
2683 end->predicate ||
2684 end->force_uncompressed ||
2685 end->force_sechalf ||
2686 reg.reladdr ||
2687 !reg.equals(end->dst)) {
2688 return NULL;
2689 } else {
2690 return end;
2691 }
2692 }
2693
2694 void
2695 fs_visitor::setup_payload_gen6()
2696 {
2697 struct intel_context *intel = &brw->intel;
2698 bool uses_depth =
2699 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2700 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2701
2702 assert(intel->gen >= 6);
2703
2704 /* R0-1: masks, pixel X/Y coordinates. */
2705 c->nr_payload_regs = 2;
2706 /* R2: only for 32-pixel dispatch. */
2707
2708 /* R3-26: barycentric interpolation coordinates. These appear in the
2709 * same order that they appear in the brw_wm_barycentric_interp_mode
2710 * enum. Each set of coordinates occupies 2 registers if dispatch width
2711 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2712 * appear if they were enabled using the "Barycentric Interpolation
2713 * Mode" bits in WM_STATE.
2714 */
2715 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2716 if (barycentric_interp_modes & (1 << i)) {
2717 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2718 c->nr_payload_regs += 2;
2719 if (dispatch_width == 16) {
2720 c->nr_payload_regs += 2;
2721 }
2722 }
2723 }
2724
2725 /* R27: interpolated depth if uses source depth */
2726 if (uses_depth) {
2727 c->source_depth_reg = c->nr_payload_regs;
2728 c->nr_payload_regs++;
2729 if (dispatch_width == 16) {
2730 /* R28: interpolated depth if not 8-wide. */
2731 c->nr_payload_regs++;
2732 }
2733 }
2734 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2735 if (uses_depth) {
2736 c->source_w_reg = c->nr_payload_regs;
2737 c->nr_payload_regs++;
2738 if (dispatch_width == 16) {
2739 /* R30: interpolated W if not 8-wide. */
2740 c->nr_payload_regs++;
2741 }
2742 }
2743 /* R31: MSAA position offsets. */
2744 /* R32-: bary for 32-pixel. */
2745 /* R58-59: interp W for 32-pixel. */
2746
2747 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2748 c->source_depth_to_render_target = true;
2749 }
2750 }
2751
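/**
 * Runs the full fragment shader compile pipeline for one dispatch width:
 * payload setup, IR-to-FS-IR translation, the optimization loop,
 * scheduling, and register allocation (retrying, with spilling, until it
 * succeeds or fails hard). Returns false on failure.
 */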
2752 bool
2753 fs_visitor::run()
2754 {
2755 sanity_param_count = fp->Base.Parameters->NumParameters;
2756 uint32_t orig_nr_params = c->prog_data.nr_params;
2757
2758 if (intel->gen >= 6)
2759 setup_payload_gen6();
2760 else
2761 setup_payload_gen4();
2762
2763 if (0) {
2764 emit_dummy_fs();
2765 } else {
2766 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2767 emit_shader_time_begin();
2768
2769 calculate_urb_setup();
2770 if (intel->gen < 6)
2771 emit_interpolation_setup_gen4();
2772 else
2773 emit_interpolation_setup_gen6();
2774
2775 /* We handle discards by keeping track of the still-live pixels in f0.1.
2776 * Initialize it with the dispatched pixels.
2777 */
2778 if (fp->UsesKill) {
2779 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2780 discard_init->flag_subreg = 1;
2781 }
2782
2783 /* Generate FS IR for main(). (The visitor only descends into
2784 * functions called "main".)
2785 */
2786 if (shader) {
2787 foreach_list(node, &*shader->ir) {
2788 ir_instruction *ir = (ir_instruction *)node;
2789 base_ir = ir;
2790 this->result = reg_undef;
2791 ir->accept(this);
2792 }
2793 } else {
2794 emit_fragment_program_code();
2795 }
2796 base_ir = NULL;
2797 if (failed)
2798 return false;
2799
2800 emit(FS_OPCODE_PLACEHOLDER_HALT);
2801
2802 emit_fb_writes();
2803
2804 split_virtual_grfs();
2805
2806 move_uniform_array_access_to_pull_constants();
2807 setup_pull_constants();
2808
2809 bool progress;
2810 do {
2811 progress = false;
2812
2813 compact_virtual_grfs();
2814
2815 progress = remove_duplicate_mrf_writes() || progress;
2816
2817 progress = opt_algebraic() || progress;
2818 progress = opt_cse() || progress;
2819 progress = opt_copy_propagate() || progress;
2820 progress = dead_code_eliminate() || progress;
2821 progress = register_coalesce() || progress;
2822 progress = register_coalesce_2() || progress;
2823 progress = compute_to_mrf() || progress;
2824 } while (progress);
2825
2826 remove_dead_constants();
2827
2828 schedule_instructions(false);
2829
2830 lower_uniform_pull_constant_loads();
2831
2832 assign_curb_setup();
2833 assign_urb_setup();
2834
2835 if (0) {
2836 /* Debug of register spilling: Go spill everything. */
2837 for (int i = 0; i < virtual_grf_count; i++) {
2838 spill_reg(i);
2839 }
2840 }
2841
2842 if (0)
2843 assign_regs_trivial();
2844 else {
2845 while (!assign_regs()) {
2846 if (failed)
2847 break;
2848 }
2849 }
2850 }
2851 assert(force_uncompressed_stack == 0);
2852 assert(force_sechalf_stack == 0);
2853
2854 /* This must come after all optimization and register allocation, since
2855 * it inserts dead code that happens to have side effects, and it does
2856 * so based on the actual physical registers in use.
2857 */
2858 insert_gen4_send_dependency_workarounds();
2859
2860 if (failed)
2861 return false;
2862
2863 schedule_instructions(true);
2864
2865 if (dispatch_width == 8) {
2866 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2867 } else {
2868 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2869
2870 /* Make sure we didn't try to sneak in an extra uniform */
2871 assert(orig_nr_params == c->prog_data.nr_params);
2872 (void) orig_nr_params;
2873 }
2874
2875 /* If any state parameters were appended, then ParameterValues could have
2876 * been realloced, in which case the driver uniform storage set up by
2877 * _mesa_associate_uniform_storage() would point to freed memory. Make
2878 * sure that didn't happen.
2879 */
2880 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2881
2882 return !failed;
2883 }
2884
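/**
 * Top-level entry point for compiling a fragment program: always builds an
 * 8-wide program, additionally attempts a 16-wide variant when the
 * hardware and program allow it (gen5+, no pull parameters, and DEBUG_NO16
 * unset), and hands both instruction lists to the generator.
 */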
2885 const unsigned *
2886 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2887 struct gl_fragment_program *fp,
2888 struct gl_shader_program *prog,
2889 unsigned *final_assembly_size)
2890 {
2891 struct intel_context *intel = &brw->intel;
2892 bool start_busy = false;
2893 float start_time = 0;
2894
2895 if (unlikely(intel->perf_debug)) {
2896 start_busy = (intel->batch.last_bo &&
2897 drm_intel_bo_busy(intel->batch.last_bo));
2898 start_time = get_time();
2899 }
2900
2901 struct brw_shader *shader = NULL;
2902 if (prog)
2903 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2904
2905 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2906 if (shader) {
2907 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2908 _mesa_print_ir(shader->ir, NULL);
2909 printf("\n\n");
2910 } else {
2911 printf("ARB_fragment_program %d ir for native fragment shader\n",
2912 fp->Base.Id);
2913 _mesa_print_program(&fp->Base);
2914 }
2915 }
2916
2917 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2918 */
2919 fs_visitor v(brw, c, prog, fp, 8);
2920 if (!v.run()) {
2921 prog->LinkStatus = false;
2922 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2923
2924 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2925 v.fail_msg);
2926
2927 return NULL;
2928 }
2929
2930 exec_list *simd16_instructions = NULL;
2931 fs_visitor v2(brw, c, prog, fp, 16);
2932 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2933 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2934 v2.import_uniforms(&v);
2935 if (!v2.run()) {
2936 perf_debug("16-wide shader failed to compile, falling back to "
2937 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2938 } else {
2939 simd16_instructions = &v2.instructions;
2940 }
2941 }
2942
2943 c->prog_data.dispatch_width = 8;
2944
2945 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2946 const unsigned *generated = g.generate_assembly(&v.instructions,
2947 simd16_instructions,
2948 final_assembly_size);
2949
2950 if (unlikely(intel->perf_debug) && shader) {
2951 if (shader->compiled_once)
2952 brw_wm_debug_recompile(brw, prog, &c->key);
2953 shader->compiled_once = true;
2954
2955 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2956 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2957 (get_time() - start_time) * 1000);
2958 }
2959 }
2960
2961 return generated;
2962 }
2963
2964 bool
2965 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2966 {
2967 struct brw_context *brw = brw_context(ctx);
2968 struct intel_context *intel = &brw->intel;
2969 struct brw_wm_prog_key key;
2970
2971 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2972 return true;
2973
2974 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2975 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2976 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2977 bool program_uses_dfdy = fp->UsesDFdy;
2978
2979 memset(&key, 0, sizeof(key));
2980
2981 if (intel->gen < 6) {
2982 if (fp->UsesKill)
2983 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2984
2985 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2986 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2987
2988 /* Just assume depth testing. */
2989 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2990 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2991 }
2992
2993 if (prog->Name != 0)
2994 key.proj_attrib_mask = ~(GLbitfield64) 0;
2995 else {
2996 /* Bit VARYING_BIT_POS of key.proj_attrib_mask is never used, so to
2997 * avoid unnecessary recompiles, always set it to 1.
2998 */
2999 key.proj_attrib_mask |= VARYING_BIT_POS;
3000 }
3001
3002 if (intel->gen < 6)
3003 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3004
3005 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3006 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3007 continue;
3008
3009 if (prog->Name == 0)
3010 key.proj_attrib_mask |= BITFIELD64_BIT(i);
3011
3012 if (intel->gen < 6) {
3013 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3014 key.input_slots_valid |= BITFIELD64_BIT(i);
3015 }
3016 }
3017
3018 key.clamp_fragment_color = true;
3019
3020 for (int i = 0; i < MAX_SAMPLERS; i++) {
3021 if (fp->Base.ShadowSamplers & (1 << i)) {
3022 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3023 key.tex.swizzles[i] =
3024 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3025 } else {
3026 /* Color sampler: assume no swizzling. */
3027 key.tex.swizzles[i] = SWIZZLE_XYZW;
3028 }
3029 }
3030
3031 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3032 key.drawable_height = ctx->DrawBuffer->Height;
3033 }
3034
3035 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3036 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3037 }
3038
3039 key.nr_color_regions = 1;
3040
3041 key.program_string_id = bfp->id;
3042
3043 uint32_t old_prog_offset = brw->wm.prog_offset;
3044 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3045
3046 bool success = do_wm_prog(brw, prog, bfp, &key);
3047
3048 brw->wm.prog_offset = old_prog_offset;
3049 brw->wm.prog_data = old_prog_data;
3050
3051 return success;
3052 }