src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/macros.h"
  36 #include "main/shaderobj.h"
  37 #include "main/uniforms.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "glsl/glsl_types.h"
  50 #include "glsl/ir_print_visitor.h"
  51
  52 void
  53 fs_inst::init()
  54 {
  55    memset(this, 0, sizeof(*this));
  56    this->opcode = BRW_OPCODE_NOP;
  57    this->conditional_mod = BRW_CONDITIONAL_NONE;
  58
  59    this->dst = reg_undef;
  60    this->src[0] = reg_undef;
  61    this->src[1] = reg_undef;
  62    this->src[2] = reg_undef;
  63 }
  64
  65 fs_inst::fs_inst()
  66 {
  67    init();
  68 }
  69
  70 fs_inst::fs_inst(enum opcode opcode)
  71 {
  72    init();
  73    this->opcode = opcode;
  74 }
  75
  76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  77 {
  78    init();
  79    this->opcode = opcode;
  80    this->dst = dst;
  81
  82    if (dst.file == GRF)
  83       assert(dst.reg_offset >= 0);
  84 }
  85
  86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  87 {
  88    init();
  89    this->opcode = opcode;
  90    this->dst = dst;
  91    this->src[0] = src0;
  92
  93    if (dst.file == GRF)
  94       assert(dst.reg_offset >= 0);
  95    if (src[0].file == GRF)
  96       assert(src[0].reg_offset >= 0);
  97 }
  98
  99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 100 {
 101    init();
 102    this->opcode = opcode;
 103    this->dst = dst;
 104    this->src[0] = src0;
 105    this->src[1] = src1;
 106
 107    if (dst.file == GRF)
 108       assert(dst.reg_offset >= 0);
 109    if (src[0].file == GRF)
 110       assert(src[0].reg_offset >= 0);
 111    if (src[1].file == GRF)
 112       assert(src[1].reg_offset >= 0);
 113 }
 114
 115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 116                  fs_reg src0, fs_reg src1, fs_reg src2)
 117 {
 118    init();
 119    this->opcode = opcode;
 120    this->dst = dst;
 121    this->src[0] = src0;
 122    this->src[1] = src1;
 123    this->src[2] = src2;
 124
 125    if (dst.file == GRF)
 126       assert(dst.reg_offset >= 0);
 127    if (src[0].file == GRF)
 128       assert(src[0].reg_offset >= 0);
 129    if (src[1].file == GRF)
 130       assert(src[1].reg_offset >= 0);
 131    if (src[2].file == GRF)
 132       assert(src[2].reg_offset >= 0);
 133 }
 134
 135 #define ALU1(op)                                                        \
 136    fs_inst *                                                            \
 137    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 138    {                                                                    \
 139       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 140    }
 141
 142 #define ALU2(op)                                                        \
 143    fs_inst *                                                            \
 144    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 145    {                                                                    \
 146       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 147    }
 148
 149 #define ALU3(op)                                                        \
 150    fs_inst *                                                            \
 151    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
 152    {                                                                    \
 153       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 154    }
 155
 156 ALU1(NOT)
 157 ALU1(MOV)
 158 ALU1(FRC)
 159 ALU1(RNDD)
 160 ALU1(RNDE)
 161 ALU1(RNDZ)
 162 ALU2(ADD)
 163 ALU2(MUL)
 164 ALU2(MACH)
 165 ALU2(AND)
 166 ALU2(OR)
 167 ALU2(XOR)
 168 ALU2(SHL)
 169 ALU2(SHR)
 170 ALU2(ASR)
 171 ALU3(LRP)
 172
 173 /** Gen4 predicated IF. */
 174 fs_inst *
 175 fs_visitor::IF(uint32_t predicate)
 176 {
 177    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 178    inst->predicate = predicate;
 179    return inst;
 180 }
 181
 182 /** Gen6+ IF with embedded comparison. */
 183 fs_inst *
 184 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 185 {
 186    assert(intel->gen >= 6);
 187    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 188                                         reg_null_d, src0, src1);
 189    inst->conditional_mod = condition;
 190    return inst;
 191 }
 192
 193 /**
 194  * CMP: Sets the low bit of the destination channels with the result
 195  * of the comparison, while the upper bits are undefined, and updates
 196  * the flag register with the packed 16 bits of the result.
 197  */
 198 fs_inst *
 199 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 200 {
 201    fs_inst *inst;
 202
 203    /* Take the instruction:
 204     *
 205     * CMP null<d> src0<f> src1<f>
 206     *
 207     * Original gen4 does type conversion to the destination type before
 208     * comparison, producing garbage results for floating point comparisons.
 209     * gen5 does the comparison on the execution type (resolved source types),
 210     * so dst type doesn't matter.  gen6 does comparison and then uses the
 211     * result as if it was the dst type with no conversion, which happens to
 212     * mostly work out for float-interpreted-as-int since our comparisons are
 213     * for >0, =0, <0.
 214     */
 215    if (intel->gen == 4) {
 216       dst.type = src0.type;
 217       if (dst.file == FIXED_HW_REG)
 218          dst.fixed_hw_reg.type = dst.type;
 219    }
 220
 221    resolve_ud_negate(&src0);
 222    resolve_ud_negate(&src1);
 223
 224    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 225    inst->conditional_mod = condition;
 226
 227    return inst;
 228 }
 229
 230 exec_list
 231 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
 232                                        fs_reg varying_offset,
 233                                        uint32_t const_offset)
 234 {
 235    exec_list instructions;
 236    fs_inst *inst;
 237
 238    fs_reg offset = fs_reg(this, glsl_type::uint_type);
 239    instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
 240
 241    if (intel->gen >= 7) {
 242       inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
 243                                   dst, surf_index, offset);
 244       instructions.push_tail(inst);
 245    } else {
 246       int base_mrf = 13;
 247       bool header_present = true;
 248
 249       fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
 250       mrf.type = BRW_REGISTER_TYPE_D;
 251
 252       /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
 253        * dword-aligned byte offset.
 254        */
 255       if (intel->gen == 6) {
 256          instructions.push_tail(MOV(mrf, offset));
 257       } else {
 258          instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
 259       }
 260       inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
 261                                   dst, surf_index);
 262       inst->header_present = header_present;
 263       inst->base_mrf = base_mrf;
 264       inst->mlen = header_present + dispatch_width / 8;
 265
 266       instructions.push_tail(inst);
 267    }
 268
 269    return instructions;
 270 }
 271
 272 /**
 273  * A helper for MOV generation for fixing up broken hardware SEND dependency
 274  * handling.
 275  */
 276 fs_inst *
 277 fs_visitor::DEP_RESOLVE_MOV(int grf)
 278 {
 279    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 280
 281    inst->ir = NULL;
 282    inst->annotation = "send dependency resolve";
 283
 284    /* The caller always wants uncompressed to emit the minimal extra
 285     * dependencies, and to avoid having to deal with aligning its regs to 2.
 286     */
 287    inst->force_uncompressed = true;
 288
 289    return inst;
 290 }
 291
 292 bool
 293 fs_inst::equals(fs_inst *inst)
 294 {
 295    return (opcode == inst->opcode &&
 296            dst.equals(inst->dst) &&
 297            src[0].equals(inst->src[0]) &&
 298            src[1].equals(inst->src[1]) &&
 299            src[2].equals(inst->src[2]) &&
 300            saturate == inst->saturate &&
 301            predicate == inst->predicate &&
 302            conditional_mod == inst->conditional_mod &&
 303            mlen == inst->mlen &&
 304            base_mrf == inst->base_mrf &&
 305            sampler == inst->sampler &&
 306            target == inst->target &&
 307            eot == inst->eot &&
 308            header_present == inst->header_present &&
 309            shadow_compare == inst->shadow_compare &&
 310            offset == inst->offset);
 311 }
 312
 313 int
 314 fs_inst::regs_written()
 315 {
 316    if (is_tex())
 317       return 4;
 318
 319    /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
 320     * but we don't currently use them...nor do we have an opcode for them.
 321     */
 322
 323    return 1;
 324 }
 325
 326 bool
 327 fs_inst::overwrites_reg(const fs_reg &reg)
 328 {
 329    return (reg.file == dst.file &&
 330            reg.reg == dst.reg &&
 331            reg.reg_offset >= dst.reg_offset  &&
 332            reg.reg_offset < dst.reg_offset + regs_written());
 333 }
 334
 335 bool
 336 fs_inst::is_tex()
 337 {
 338    return (opcode == SHADER_OPCODE_TEX ||
 339            opcode == FS_OPCODE_TXB ||
 340            opcode == SHADER_OPCODE_TXD ||
 341            opcode == SHADER_OPCODE_TXF ||
 342            opcode == SHADER_OPCODE_TXF_MS ||
 343            opcode == SHADER_OPCODE_TXL ||
 344            opcode == SHADER_OPCODE_TXS ||
 345            opcode == SHADER_OPCODE_LOD);
 346 }
 347
 348 bool
 349 fs_inst::is_math()
 350 {
 351    return (opcode == SHADER_OPCODE_RCP ||
 352            opcode == SHADER_OPCODE_RSQ ||
 353            opcode == SHADER_OPCODE_SQRT ||
 354            opcode == SHADER_OPCODE_EXP2 ||
 355            opcode == SHADER_OPCODE_LOG2 ||
 356            opcode == SHADER_OPCODE_SIN ||
 357            opcode == SHADER_OPCODE_COS ||
 358            opcode == SHADER_OPCODE_INT_QUOTIENT ||
 359            opcode == SHADER_OPCODE_INT_REMAINDER ||
 360            opcode == SHADER_OPCODE_POW);
 361 }
 362
 363 bool
 364 fs_inst::is_control_flow()
 365 {
 366    switch (opcode) {
 367    case BRW_OPCODE_DO:
 368    case BRW_OPCODE_WHILE:
 369    case BRW_OPCODE_IF:
 370    case BRW_OPCODE_ELSE:
 371    case BRW_OPCODE_ENDIF:
 372    case BRW_OPCODE_BREAK:
 373    case BRW_OPCODE_CONTINUE:
 374       return true;
 375    default:
 376       return false;
 377    }
 378 }
 379
 380 bool
 381 fs_inst::is_send_from_grf()
 382 {
 383    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 384            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
 385            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 386             src[1].file == GRF));
 387 }
 388
 389 bool
 390 fs_visitor::can_do_source_mods(fs_inst *inst)
 391 {
 392    if (intel->gen == 6 && inst->is_math())
 393       return false;
 394
 395    if (inst->is_send_from_grf())
 396       return false;
 397
 398    return true;
 399 }
 400
 401 void
 402 fs_reg::init()
 403 {
 404    memset(this, 0, sizeof(*this));
 405    this->smear = -1;
 406 }
 407
 408 /** Generic unset register constructor. */
 409 fs_reg::fs_reg()
 410 {
 411    init();
 412    this->file = BAD_FILE;
 413 }
 414
 415 /** Immediate value constructor. */
 416 fs_reg::fs_reg(float f)
 417 {
 418    init();
 419    this->file = IMM;
 420    this->type = BRW_REGISTER_TYPE_F;
 421    this->imm.f = f;
 422 }
 423
 424 /** Immediate value constructor. */
 425 fs_reg::fs_reg(int32_t i)
 426 {
 427    init();
 428    this->file = IMM;
 429    this->type = BRW_REGISTER_TYPE_D;
 430    this->imm.i = i;
 431 }
 432
 433 /** Immediate value constructor. */
 434 fs_reg::fs_reg(uint32_t u)
 435 {
 436    init();
 437    this->file = IMM;
 438    this->type = BRW_REGISTER_TYPE_UD;
 439    this->imm.u = u;
 440 }
 441
 442 /** Fixed brw_reg Immediate value constructor. */
 443 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 444 {
 445    init();
 446    this->file = FIXED_HW_REG;
 447    this->fixed_hw_reg = fixed_hw_reg;
 448    this->type = fixed_hw_reg.type;
 449 }
 450
 451 bool
 452 fs_reg::equals(const fs_reg &r) const
 453 {
 454    return (file == r.file &&
 455            reg == r.reg &&
 456            reg_offset == r.reg_offset &&
 457            type == r.type &&
 458            negate == r.negate &&
 459            abs == r.abs &&
 460            !reladdr && !r.reladdr &&
 461            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 462                   sizeof(fixed_hw_reg)) == 0 &&
 463            smear == r.smear &&
 464            imm.u == r.imm.u);
 465 }
 466
 467 bool
 468 fs_reg::is_zero() const
 469 {
 470    if (file != IMM)
 471       return false;
 472
 473    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 474 }
 475
 476 bool
 477 fs_reg::is_one() const
 478 {
 479    if (file != IMM)
 480       return false;
 481
 482    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 483 }
 484
 485 int
 486 fs_visitor::type_size(const struct glsl_type *type)
 487 {
 488    unsigned int size, i;
 489
 490    switch (type->base_type) {
 491    case GLSL_TYPE_UINT:
 492    case GLSL_TYPE_INT:
 493    case GLSL_TYPE_FLOAT:
 494    case GLSL_TYPE_BOOL:
 495       return type->components();
 496    case GLSL_TYPE_ARRAY:
 497       return type_size(type->fields.array) * type->length;
 498    case GLSL_TYPE_STRUCT:
 499       size = 0;
 500       for (i = 0; i < type->length; i++) {
 501          size += type_size(type->fields.structure[i].type);
 502       }
 503       return size;
 504    case GLSL_TYPE_SAMPLER:
 505       /* Samplers take up no register space, since they're baked in at
 506        * link time.
 507        */
 508       return 0;
 509    case GLSL_TYPE_VOID:
 510    case GLSL_TYPE_ERROR:
 511    case GLSL_TYPE_INTERFACE:
 512       assert(!"not reached");
 513       break;
 514    }
 515
 516    return 0;
 517 }
 518
 519 fs_reg
 520 fs_visitor::get_timestamp()
 521 {
 522    assert(intel->gen >= 7);
 523
 524    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 525                                           BRW_ARF_TIMESTAMP,
 526                                           0),
 527                              BRW_REGISTER_TYPE_UD));
 528
 529    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 530
 531    fs_inst *mov = emit(MOV(dst, ts));
 532    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 533     * even if it's not enabled in the dispatch.
 534     */
 535    mov->force_writemask_all = true;
 536    mov->force_uncompressed = true;
 537
 538    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 539     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 540     * which is plenty of time for our purposes.  It is identical across the
 541     * EUs, but since it's tracking GPU core speed it will increment at a
 542     * varying rate as render P-states change.
 543     *
 544     * The caller could also check if render P-states have changed (or anything
 545     * else that might disrupt timing) by setting smear to 2 and checking if
 546     * that field is != 0.
 547     */
 548    dst.smear = 0;
 549
 550    return dst;
 551 }
 552
 553 void
 554 fs_visitor::emit_shader_time_begin()
 555 {
 556    current_annotation = "shader time start";
 557    shader_start_time = get_timestamp();
 558 }
 559
 560 void
 561 fs_visitor::emit_shader_time_end()
 562 {
 563    current_annotation = "shader time end";
 564
 565    enum shader_time_shader_type type, written_type, reset_type;
 566    if (dispatch_width == 8) {
 567       type = ST_FS8;
 568       written_type = ST_FS8_WRITTEN;
 569       reset_type = ST_FS8_RESET;
 570    } else {
 571       assert(dispatch_width == 16);
 572       type = ST_FS16;
 573       written_type = ST_FS16_WRITTEN;
 574       reset_type = ST_FS16_RESET;
 575    }
 576
 577    fs_reg shader_end_time = get_timestamp();
 578
 579    /* Check that there weren't any timestamp reset events (assuming these
 580     * were the only two timestamp reads that happened).
 581     */
 582    fs_reg reset = shader_end_time;
 583    reset.smear = 2;
 584    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 585    test->conditional_mod = BRW_CONDITIONAL_Z;
 586    emit(IF(BRW_PREDICATE_NORMAL));
 587
 588    push_force_uncompressed();
 589    fs_reg start = shader_start_time;
 590    start.negate = true;
 591    fs_reg diff = fs_reg(this, glsl_type::uint_type);
 592    emit(ADD(diff, start, shader_end_time));
 593
 594    /* If there were no instructions between the two timestamp gets, the diff
 595     * is 2 cycles.  Remove that overhead, so I can forget about that when
 596     * trying to determine the time taken for single instructions.
 597     */
 598    emit(ADD(diff, diff, fs_reg(-2u)));
 599
 600    emit_shader_time_write(type, diff);
 601    emit_shader_time_write(written_type, fs_reg(1u));
 602    emit(BRW_OPCODE_ELSE);
 603    emit_shader_time_write(reset_type, fs_reg(1u));
 604    emit(BRW_OPCODE_ENDIF);
 605
 606    pop_force_uncompressed();
 607 }
 608
 609 void
 610 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 611                                    fs_reg value)
 612 {
 613    int shader_time_index = brw_get_shader_time_index(brw, prog, &fp->Base,
 614                                                      type);
 615    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 616
 617    fs_reg payload;
 618    if (dispatch_width == 8)
 619       payload = fs_reg(this, glsl_type::uvec2_type);
 620    else
 621       payload = fs_reg(this, glsl_type::uint_type);
 622
 623    emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 624                 fs_reg(), payload, offset, value));
 625 }
 626
 627 void
 628 fs_visitor::fail(const char *format, ...)
 629 {
 630    va_list va;
 631    char *msg;
 632
 633    if (failed)
 634       return;
 635
 636    failed = true;
 637
 638    va_start(va, format);
 639    msg = ralloc_vasprintf(mem_ctx, format, va);
 640    va_end(va);
 641    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 642
 643    this->fail_msg = msg;
 644
 645    if (INTEL_DEBUG & DEBUG_WM) {
 646       fprintf(stderr, "%s",  msg);
 647    }
 648 }
 649
 650 fs_inst *
 651 fs_visitor::emit(enum opcode opcode)
 652 {
 653    return emit(fs_inst(opcode));
 654 }
 655
 656 fs_inst *
 657 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 658 {
 659    return emit(fs_inst(opcode, dst));
 660 }
 661
 662 fs_inst *
 663 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 664 {
 665    return emit(fs_inst(opcode, dst, src0));
 666 }
 667
 668 fs_inst *
 669 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 670 {
 671    return emit(fs_inst(opcode, dst, src0, src1));
 672 }
 673
 674 fs_inst *
 675 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 676                  fs_reg src0, fs_reg src1, fs_reg src2)
 677 {
 678    return emit(fs_inst(opcode, dst, src0, src1, src2));
 679 }
 680
 681 void
 682 fs_visitor::push_force_uncompressed()
 683 {
 684    force_uncompressed_stack++;
 685 }
 686
 687 void
 688 fs_visitor::pop_force_uncompressed()
 689 {
 690    force_uncompressed_stack--;
 691    assert(force_uncompressed_stack >= 0);
 692 }
 693
 694 void
 695 fs_visitor::push_force_sechalf()
 696 {
 697    force_sechalf_stack++;
 698 }
 699
 700 void
 701 fs_visitor::pop_force_sechalf()
 702 {
 703    force_sechalf_stack--;
 704    assert(force_sechalf_stack >= 0);
 705 }
 706
 707 /**
 708  * Returns how many MRFs an FS opcode will write over.
 709  *
 710  * Note that this is not the 0 or 1 implied writes in an actual gen
 711  * instruction -- the FS opcodes often generate MOVs in addition.
 712  */
 713 int
 714 fs_visitor::implied_mrf_writes(fs_inst *inst)
 715 {
 716    if (inst->mlen == 0)
 717       return 0;
 718
 719    switch (inst->opcode) {
 720    case SHADER_OPCODE_RCP:
 721    case SHADER_OPCODE_RSQ:
 722    case SHADER_OPCODE_SQRT:
 723    case SHADER_OPCODE_EXP2:
 724    case SHADER_OPCODE_LOG2:
 725    case SHADER_OPCODE_SIN:
 726    case SHADER_OPCODE_COS:
 727       return 1 * dispatch_width / 8;
 728    case SHADER_OPCODE_POW:
 729    case SHADER_OPCODE_INT_QUOTIENT:
 730    case SHADER_OPCODE_INT_REMAINDER:
 731       return 2 * dispatch_width / 8;
 732    case SHADER_OPCODE_TEX:
 733    case FS_OPCODE_TXB:
 734    case SHADER_OPCODE_TXD:
 735    case SHADER_OPCODE_TXF:
 736    case SHADER_OPCODE_TXF_MS:
 737    case SHADER_OPCODE_TXL:
 738    case SHADER_OPCODE_TXS:
 739    case SHADER_OPCODE_LOD:
 740       return 1;
 741    case FS_OPCODE_FB_WRITE:
 742       return 2;
 743    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 744    case FS_OPCODE_UNSPILL:
 745       return 1;
 746    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 747       return inst->header_present;
 748    case FS_OPCODE_SPILL:
 749       return 2;
 750    default:
 751       assert(!"not reached");
 752       return inst->mlen;
 753    }
 754 }
 755
 756 int
 757 fs_visitor::virtual_grf_alloc(int size)
 758 {
 759    if (virtual_grf_array_size <= virtual_grf_count) {
 760       if (virtual_grf_array_size == 0)
 761          virtual_grf_array_size = 16;
 762       else
 763          virtual_grf_array_size *= 2;
 764       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 765                                    virtual_grf_array_size);
 766    }
 767    virtual_grf_sizes[virtual_grf_count] = size;
 768    return virtual_grf_count++;
 769 }
 770
 771 /** Fixed HW reg constructor. */
 772 fs_reg::fs_reg(enum register_file file, int reg)
 773 {
 774    init();
 775    this->file = file;
 776    this->reg = reg;
 777    this->type = BRW_REGISTER_TYPE_F;
 778 }
 779
 780 /** Fixed HW reg constructor. */
 781 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 782 {
 783    init();
 784    this->file = file;
 785    this->reg = reg;
 786    this->type = type;
 787 }
 788
 789 /** Automatic reg constructor. */
 790 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 791 {
 792    init();
 793
 794    this->file = GRF;
 795    this->reg = v->virtual_grf_alloc(v->type_size(type));
 796    this->reg_offset = 0;
 797    this->type = brw_type_for_base_type(type);
 798 }
 799
 800 fs_reg *
 801 fs_visitor::variable_storage(ir_variable *var)
 802 {
 803    return (fs_reg *)hash_table_find(this->variable_ht, var);
 804 }
 805
 806 void
 807 import_uniforms_callback(const void *key,
 808                          void *data,
 809                          void *closure)
 810 {
 811    struct hash_table *dst_ht = (struct hash_table *)closure;
 812    const fs_reg *reg = (const fs_reg *)data;
 813
 814    if (reg->file != UNIFORM)
 815       return;
 816
 817    hash_table_insert(dst_ht, data, key);
 818 }
 819
 820 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 821  * This brings in those uniform definitions
 822  */
 823 void
 824 fs_visitor::import_uniforms(fs_visitor *v)
 825 {
 826    hash_table_call_foreach(v->variable_ht,
 827                            import_uniforms_callback,
 828                            variable_ht);
 829    this->params_remap = v->params_remap;
 830 }
 831
 832 /* Our support for uniforms is piggy-backed on the struct
 833  * gl_fragment_program, because that's where the values actually
 834  * get stored, rather than in some global gl_shader_program uniform
 835  * store.
 836  */
 837 void
 838 fs_visitor::setup_uniform_values(ir_variable *ir)
 839 {
 840    int namelen = strlen(ir->name);
 841
 842    /* The data for our (non-builtin) uniforms is stored in a series of
 843     * gl_uniform_driver_storage structs for each subcomponent that
 844     * glGetUniformLocation() could name.  We know it's been set up in the same
 845     * order we'd walk the type, so walk the list of storage and find anything
 846     * with our name, or the prefix of a component that starts with our name.
 847     */
 848    unsigned params_before = c->prog_data.nr_params;
 849    for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
 850       struct gl_uniform_storage *storage = &prog->UniformStorage[u];
 851
 852       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 853           (storage->name[namelen] != 0 &&
 854            storage->name[namelen] != '.' &&
 855            storage->name[namelen] != '[')) {
 856          continue;
 857       }
 858
 859       unsigned slots = storage->type->component_slots();
 860       if (storage->array_elements)
 861          slots *= storage->array_elements;
 862
 863       for (unsigned i = 0; i < slots; i++) {
 864          c->prog_data.param[c->prog_data.nr_params++] =
 865             &storage->storage[i].f;
 866       }
 867    }
 868
 869    /* Make sure we actually initialized the right amount of stuff here. */
 870    assert(params_before + ir->type->component_slots() ==
 871           c->prog_data.nr_params);
 872 }
 873
 874
 875 /* Our support for builtin uniforms is even scarier than non-builtin.
 876  * It sits on top of the PROG_STATE_VAR parameters that are
 877  * automatically updated from GL context state.
 878  */
 879 void
 880 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 881 {
 882    const ir_state_slot *const slots = ir->state_slots;
 883    assert(ir->state_slots != NULL);
 884
 885    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 886       /* This state reference has already been setup by ir_to_mesa, but we'll
 887        * get the same index back here.
 888        */
 889       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 890                                             (gl_state_index *)slots[i].tokens);
 891
 892       /* Add each of the unique swizzles of the element as a parameter.
 893        * This'll end up matching the expected layout of the
 894        * array/matrix/structure we're trying to fill in.
 895        */
 896       int last_swiz = -1;
 897       for (unsigned int j = 0; j < 4; j++) {
 898          int swiz = GET_SWZ(slots[i].swizzle, j);
 899          if (swiz == last_swiz)
 900             break;
 901          last_swiz = swiz;
 902
 903          c->prog_data.param[c->prog_data.nr_params++] =
 904             &fp->Base.Parameters->ParameterValues[index][swiz].f;
 905       }
 906    }
 907 }
 908
 909 fs_reg *
 910 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 911 {
 912    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 913    fs_reg wpos = *reg;
 914    bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
 915
 916    /* gl_FragCoord.x */
 917    if (ir->pixel_center_integer) {
 918       emit(MOV(wpos, this->pixel_x));
 919    } else {
 920       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
 921    }
 922    wpos.reg_offset++;
 923
 924    /* gl_FragCoord.y */
 925    if (!flip && ir->pixel_center_integer) {
 926       emit(MOV(wpos, this->pixel_y));
 927    } else {
 928       fs_reg pixel_y = this->pixel_y;
 929       float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
 930
 931       if (flip) {
 932          pixel_y.negate = true;
 933          offset += c->key.drawable_height - 1.0;
 934       }
 935
 936       emit(ADD(wpos, pixel_y, fs_reg(offset)));
 937    }
 938    wpos.reg_offset++;
 939
 940    /* gl_FragCoord.z */
 941    if (intel->gen >= 6) {
 942       emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
 943    } else {
 944       emit(FS_OPCODE_LINTERP, wpos,
 945            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 946            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 947            interp_reg(VARYING_SLOT_POS, 2));
 948    }
 949    wpos.reg_offset++;
 950
 951    /* gl_FragCoord.w: Already set up in emit_interpolation */
 952    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
 953
 954    return reg;
 955 }
 956
 957 fs_inst *
 958 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 959                          glsl_interp_qualifier interpolation_mode,
 960                          bool is_centroid)
 961 {
 962    brw_wm_barycentric_interp_mode barycoord_mode;
 963    if (is_centroid) {
 964       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 965          barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
 966       else
 967          barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
 968    } else {
 969       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 970          barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 971       else
 972          barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
 973    }
 974    return emit(FS_OPCODE_LINTERP, attr,
 975                this->delta_x[barycoord_mode],
 976                this->delta_y[barycoord_mode], interp);
 977 }
 978
 979 fs_reg *
 980 fs_visitor::emit_general_interpolation(ir_variable *ir)
 981 {
 982    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 983    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
 984    fs_reg attr = *reg;
 985
 986    unsigned int array_elements;
 987    const glsl_type *type;
 988
 989    if (ir->type->is_array()) {
 990       array_elements = ir->type->length;
 991       if (array_elements == 0) {
 992          fail("dereferenced array '%s' has length 0\n", ir->name);
 993       }
 994       type = ir->type->fields.array;
 995    } else {
 996       array_elements = 1;
 997       type = ir->type;
 998    }
 999
1000    glsl_interp_qualifier interpolation_mode =
1001       ir->determine_interpolation_mode(c->key.flat_shade);
1002
1003    int location = ir->location;
1004    for (unsigned int i = 0; i < array_elements; i++) {
1005       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1006          if (urb_setup[location] == -1) {
1007             /* If there's no incoming setup data for this slot, don't
1008              * emit interpolation for it.
1009              */
1010             attr.reg_offset += type->vector_elements;
1011             location++;
1012             continue;
1013          }
1014
1015          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1016             /* Constant interpolation (flat shading) case. The SF has
1017              * handed us defined values in only the constant offset
1018              * field of the setup reg.
1019              */
1020             for (unsigned int k = 0; k < type->vector_elements; k++) {
1021                struct brw_reg interp = interp_reg(location, k);
1022                interp = suboffset(interp, 3);
1023                interp.type = reg->type;
1024                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1025                attr.reg_offset++;
1026             }
1027          } else {
1028             /* Smooth/noperspective interpolation case. */
1029             for (unsigned int k = 0; k < type->vector_elements; k++) {
1030                /* FINISHME: At some point we probably want to push
1031                 * this farther by giving similar treatment to the
1032                 * other potentially constant components of the
1033                 * attribute, as well as making brw_vs_constval.c
1034                 * handle varyings other than gl_TexCoord.
1035                 */
1036                if (location >= VARYING_SLOT_TEX0 &&
1037                    location <= VARYING_SLOT_TEX7 &&
1038                    k == 3 && !(c->key.proj_attrib_mask
1039                                & BITFIELD64_BIT(location))) {
1040                   emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
1041                } else {
1042                   struct brw_reg interp = interp_reg(location, k);
1043                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1044                                ir->centroid);
1045                   if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1046                      /* Get the pixel/sample mask into f0 so that we know
1047                       * which pixels are lit.  Then, for each channel that is
1048                       * unlit, replace the centroid data with non-centroid
1049                       * data.
1050                       */
1051                      emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
1052                      fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1053                                                   interpolation_mode, false);
1054                      inst->predicate = BRW_PREDICATE_NORMAL;
1055                      inst->predicate_inverse = true;
1056                   }
1057                   if (intel->gen < 6) {
1058                      emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1059                   }
1060                }
1061                attr.reg_offset++;
1062             }
1063
1064          }
1065          location++;
1066       }
1067    }
1068
1069    return reg;
1070 }
1071
1072 fs_reg *
1073 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1074 {
1075    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1076
1077    /* The frontfacing comes in as a bit in the thread payload. */
1078    if (intel->gen >= 6) {
1079       emit(BRW_OPCODE_ASR, *reg,
1080            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1081            fs_reg(15));
1082       emit(BRW_OPCODE_NOT, *reg, *reg);
1083       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1084    } else {
1085       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1086       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1087        * us front face
1088        */
1089       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1090       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1091    }
1092
1093    return reg;
1094 }
1095
1096 fs_reg
1097 fs_visitor::fix_math_operand(fs_reg src)
1098 {
1099    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1100     * might be able to do better by doing execsize = 1 math and then
1101     * expanding that result out, but we would need to be careful with
1102     * masking.
1103     *
1104     * The hardware ignores source modifiers (negate and abs) on math
1105     * instructions, so we also move to a temp to set those up.
1106     */
1107    if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1108        !src.abs && !src.negate)
1109       return src;
1110
1111    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1112     * operands to math
1113     */
1114    if (intel->gen >= 7 && src.file != IMM)
1115       return src;
1116
1117    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1118    expanded.type = src.type;
1119    emit(BRW_OPCODE_MOV, expanded, src);
1120    return expanded;
1121 }
1122
1123 fs_inst *
1124 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1125 {
1126    switch (opcode) {
1127    case SHADER_OPCODE_RCP:
1128    case SHADER_OPCODE_RSQ:
1129    case SHADER_OPCODE_SQRT:
1130    case SHADER_OPCODE_EXP2:
1131    case SHADER_OPCODE_LOG2:
1132    case SHADER_OPCODE_SIN:
1133    case SHADER_OPCODE_COS:
1134       break;
1135    default:
1136       assert(!"not reached: bad math opcode");
1137       return NULL;
1138    }
1139
1140    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1141     * might be able to do better by doing execsize = 1 math and then
1142     * expanding that result out, but we would need to be careful with
1143     * masking.
1144     *
1145     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1146     * instructions, so we also move to a temp to set those up.
1147     */
1148    if (intel->gen >= 6)
1149       src = fix_math_operand(src);
1150
1151    fs_inst *inst = emit(opcode, dst, src);
1152
1153    if (intel->gen < 6) {
1154       inst->base_mrf = 2;
1155       inst->mlen = dispatch_width / 8;
1156    }
1157
1158    return inst;
1159 }
1160
1161 fs_inst *
1162 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1163 {
1164    int base_mrf = 2;
1165    fs_inst *inst;
1166
1167    switch (opcode) {
1168    case SHADER_OPCODE_INT_QUOTIENT:
1169    case SHADER_OPCODE_INT_REMAINDER:
1170       if (intel->gen >= 7 && dispatch_width == 16)
1171          fail("16-wide INTDIV unsupported\n");
1172       break;
1173    case SHADER_OPCODE_POW:
1174       break;
1175    default:
1176       assert(!"not reached: unsupported binary math opcode.");
1177       return NULL;
1178    }
1179
1180    if (intel->gen >= 6) {
1181       src0 = fix_math_operand(src0);
1182       src1 = fix_math_operand(src1);
1183
1184       inst = emit(opcode, dst, src0, src1);
1185    } else {
1186       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1187        * "Message Payload":
1188        *
1189        * "Operand0[7].  For the INT DIV functions, this operand is the
1190        *  denominator."
1191        *  ...
1192        * "Operand1[7].  For the INT DIV functions, this operand is the
1193        *  numerator."
1194        */
1195       bool is_int_div = opcode != SHADER_OPCODE_POW;
1196       fs_reg &op0 = is_int_div ? src1 : src0;
1197       fs_reg &op1 = is_int_div ? src0 : src1;
1198
1199       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1200       inst = emit(opcode, dst, op0, reg_null_f);
1201
1202       inst->base_mrf = base_mrf;
1203       inst->mlen = 2 * dispatch_width / 8;
1204    }
1205    return inst;
1206 }
1207
1208 void
1209 fs_visitor::assign_curb_setup()
1210 {
1211    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1212    if (dispatch_width == 8) {
1213       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1214    } else {
1215       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1216    }
1217
1218    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1219    foreach_list(node, &this->instructions) {
1220       fs_inst *inst = (fs_inst *)node;
1221
1222       for (unsigned int i = 0; i < 3; i++) {
1223          if (inst->src[i].file == UNIFORM) {
1224             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1225             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1226                                                   constant_nr / 8,
1227                                                   constant_nr % 8);
1228
1229             inst->src[i].file = FIXED_HW_REG;
1230             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1231          }
1232       }
1233    }
1234 }
1235
1236 void
1237 fs_visitor::calculate_urb_setup()
1238 {
1239    for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1240       urb_setup[i] = -1;
1241    }
1242
1243    int urb_next = 0;
1244    /* Figure out where each of the incoming setup attributes lands. */
1245    if (intel->gen >= 6) {
1246       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1247          if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1248             urb_setup[i] = urb_next++;
1249          }
1250       }
1251    } else {
1252       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1253       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1254          /* Point size is packed into the header, not as a general attribute */
1255          if (i == VARYING_SLOT_PSIZ)
1256             continue;
1257
1258          if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1259             /* The back color slot is skipped when the front color is
1260              * also written to.  In addition, some slots can be
1261              * written in the vertex shader and not read in the
1262              * fragment shader.  So the register number must always be
1263              * incremented, mapped or not.
1264              */
1265             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1266                urb_setup[i] = urb_next;
1267             urb_next++;
1268          }
1269       }
1270
1271       /*
1272        * It's a FS only attribute, and we did interpolation for this attribute
1273        * in SF thread. So, count it here, too.
1274        *
1275        * See compile_sf_prog() for more info.
1276        */
1277       if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1278          urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1279    }
1280
1281    /* Each attribute is 4 setup channels, each of which is half a reg. */
1282    c->prog_data.urb_read_length = urb_next * 2;
1283 }
1284
1285 void
1286 fs_visitor::assign_urb_setup()
1287 {
1288    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1289
1290    /* Offset all the urb_setup[] index by the actual position of the
1291     * setup regs, now that the location of the constants has been chosen.
1292     */
1293    foreach_list(node, &this->instructions) {
1294       fs_inst *inst = (fs_inst *)node;
1295
1296       if (inst->opcode == FS_OPCODE_LINTERP) {
1297          assert(inst->src[2].file == FIXED_HW_REG);
1298          inst->src[2].fixed_hw_reg.nr += urb_start;
1299       }
1300
1301       if (inst->opcode == FS_OPCODE_CINTERP) {
1302          assert(inst->src[0].file == FIXED_HW_REG);
1303          inst->src[0].fixed_hw_reg.nr += urb_start;
1304       }
1305    }
1306
1307    this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1308 }
1309
1310 /**
1311  * Split large virtual GRFs into separate components if we can.
1312  *
1313  * This is mostly duplicated with what brw_fs_vector_splitting does,
1314  * but that's really conservative because it's afraid of doing
1315  * splitting that doesn't result in real progress after the rest of
1316  * the optimization phases, which would cause infinite looping in
1317  * optimization.  We can do it once here, safely.  This also has the
1318  * opportunity to split interpolated values, or maybe even uniforms,
1319  * which we don't have at the IR level.
1320  *
1321  * We want to split, because virtual GRFs are what we register
1322  * allocate and spill (due to contiguousness requirements for some
1323  * instructions), and they're what we naturally generate in the
1324  * codegen process, but most virtual GRFs don't actually need to be
1325  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1326  * live intervals and better dead code elimination and coalescing.
1327  */
1328 void
1329 fs_visitor::split_virtual_grfs()
1330 {
1331    int num_vars = this->virtual_grf_count;
1332    bool split_grf[num_vars];
1333    int new_virtual_grf[num_vars];
1334
1335    /* Try to split anything > 0 sized. */
1336    for (int i = 0; i < num_vars; i++) {
1337       if (this->virtual_grf_sizes[i] != 1)
1338          split_grf[i] = true;
1339       else
1340          split_grf[i] = false;
1341    }
1342
1343    if (brw->has_pln &&
1344        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1345       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1346        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1347        * Gen6, that was the only supported interpolation mode, and since Gen6,
1348        * delta_x and delta_y are in fixed hardware registers.
1349        */
1350       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1351          false;
1352    }
1353
1354    foreach_list(node, &this->instructions) {
1355       fs_inst *inst = (fs_inst *)node;
1356
1357       /* If there's a SEND message that requires contiguous destination
1358        * registers, no splitting is allowed.
1359        */
1360       if (inst->regs_written() > 1) {
1361          split_grf[inst->dst.reg] = false;
1362       }
1363
1364       /* If we're sending from a GRF, don't split it, on the assumption that
1365        * the send is reading the whole thing.
1366        */
1367       if (inst->is_send_from_grf()) {
1368          split_grf[inst->src[0].reg] = false;
1369       }
1370    }
1371
1372    /* Allocate new space for split regs.  Note that the virtual
1373     * numbers will be contiguous.
1374     */
1375    for (int i = 0; i < num_vars; i++) {
1376       if (split_grf[i]) {
1377          new_virtual_grf[i] = virtual_grf_alloc(1);
1378          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1379             int reg = virtual_grf_alloc(1);
1380             assert(reg == new_virtual_grf[i] + j - 1);
1381             (void) reg;
1382          }
1383          this->virtual_grf_sizes[i] = 1;
1384       }
1385    }
1386
1387    foreach_list(node, &this->instructions) {
1388       fs_inst *inst = (fs_inst *)node;
1389
1390       if (inst->dst.file == GRF &&
1391           split_grf[inst->dst.reg] &&
1392           inst->dst.reg_offset != 0) {
1393          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1394                           inst->dst.reg_offset - 1);
1395          inst->dst.reg_offset = 0;
1396       }
1397       for (int i = 0; i < 3; i++) {
1398          if (inst->src[i].file == GRF &&
1399              split_grf[inst->src[i].reg] &&
1400              inst->src[i].reg_offset != 0) {
1401             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1402                                 inst->src[i].reg_offset - 1);
1403             inst->src[i].reg_offset = 0;
1404          }
1405       }
1406    }
1407    this->live_intervals_valid = false;
1408 }
1409
1410 /**
1411  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1412  *
1413  * During code generation, we create tons of temporary variables, many of
1414  * which get immediately killed and are never used again.  Yet, in later
1415  * optimization and analysis passes, such as compute_live_intervals, we need
1416  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1417  * overhead.
1418  */
1419 void
1420 fs_visitor::compact_virtual_grfs()
1421 {
1422    /* Mark which virtual GRFs are used, and count how many. */
1423    int remap_table[this->virtual_grf_count];
1424    memset(remap_table, -1, sizeof(remap_table));
1425
1426    foreach_list(node, &this->instructions) {
1427       const fs_inst *inst = (const fs_inst *) node;
1428
1429       if (inst->dst.file == GRF)
1430          remap_table[inst->dst.reg] = 0;
1431
1432       for (int i = 0; i < 3; i++) {
1433          if (inst->src[i].file == GRF)
1434             remap_table[inst->src[i].reg] = 0;
1435       }
1436    }
1437
1438    /* In addition to registers used in instructions, fs_visitor keeps
1439     * direct references to certain special values which must be patched:
1440     */
1441    fs_reg *special[] = {
1442       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1443       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1444       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1445       &delta_x[0], &delta_x[1], &delta_x[2],
1446       &delta_x[3], &delta_x[4], &delta_x[5],
1447       &delta_y[0], &delta_y[1], &delta_y[2],
1448       &delta_y[3], &delta_y[4], &delta_y[5],
1449    };
1450    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1451    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1452
1453    /* Treat all special values as used, to be conservative */
1454    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1455       if (special[i]->file == GRF)
1456          remap_table[special[i]->reg] = 0;
1457    }
1458
1459    /* Compact the GRF arrays. */
1460    int new_index = 0;
1461    for (int i = 0; i < this->virtual_grf_count; i++) {
1462       if (remap_table[i] != -1) {
1463          remap_table[i] = new_index;
1464          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1465          if (live_intervals_valid) {
1466             virtual_grf_use[new_index] = virtual_grf_use[i];
1467             virtual_grf_def[new_index] = virtual_grf_def[i];
1468          }
1469          ++new_index;
1470       }
1471    }
1472
1473    this->virtual_grf_count = new_index;
1474
1475    /* Patch all the instructions to use the newly renumbered registers */
1476    foreach_list(node, &this->instructions) {
1477       fs_inst *inst = (fs_inst *) node;
1478
1479       if (inst->dst.file == GRF)
1480          inst->dst.reg = remap_table[inst->dst.reg];
1481
1482       for (int i = 0; i < 3; i++) {
1483          if (inst->src[i].file == GRF)
1484             inst->src[i].reg = remap_table[inst->src[i].reg];
1485       }
1486    }
1487
1488    /* Patch all the references to special values */
1489    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1490       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1491          special[i]->reg = remap_table[special[i]->reg];
1492    }
1493 }
1494
1495 bool
1496 fs_visitor::remove_dead_constants()
1497 {
1498    if (dispatch_width == 8) {
1499       this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1500
1501       for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1502          this->params_remap[i] = -1;
1503
1504       /* Find which params are still in use. */
1505       foreach_list(node, &this->instructions) {
1506          fs_inst *inst = (fs_inst *)node;
1507
1508          for (int i = 0; i < 3; i++) {
1509             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1510
1511             if (inst->src[i].file != UNIFORM)
1512                continue;
1513
1514             assert(constant_nr < (int)c->prog_data.nr_params);
1515
1516             /* For now, set this to non-negative.  We'll give it the
1517              * actual new number in a moment, in order to keep the
1518              * register numbers nicely ordered.
1519              */
1520             this->params_remap[constant_nr] = 0;
1521          }
1522       }
1523
1524       /* Figure out what the new numbers for the params will be.  At some
1525        * point when we're doing uniform array access, we're going to want
1526        * to keep the distinction between .reg and .reg_offset, but for
1527        * now we don't care.
1528        */
1529       unsigned int new_nr_params = 0;
1530       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1531          if (this->params_remap[i] != -1) {
1532             this->params_remap[i] = new_nr_params++;
1533          }
1534       }
1535
1536       /* Update the list of params to be uploaded to match our new numbering. */
1537       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1538          int remapped = this->params_remap[i];
1539
1540          if (remapped == -1)
1541             continue;
1542
1543          c->prog_data.param[remapped] = c->prog_data.param[i];
1544       }
1545
1546       c->prog_data.nr_params = new_nr_params;
1547    } else {
1548       /* This should have been generated in the 8-wide pass already. */
1549       assert(this->params_remap);
1550    }
1551
1552    /* Now do the renumbering of the shader to remove unused params. */
1553    foreach_list(node, &this->instructions) {
1554       fs_inst *inst = (fs_inst *)node;
1555
1556       for (int i = 0; i < 3; i++) {
1557          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1558
1559          if (inst->src[i].file != UNIFORM)
1560             continue;
1561
1562          assert(this->params_remap[constant_nr] != -1);
1563          inst->src[i].reg = this->params_remap[constant_nr];
1564          inst->src[i].reg_offset = 0;
1565       }
1566    }
1567
1568    return true;
1569 }
1570
1571 /*
1572  * Implements array access of uniforms by inserting a
1573  * PULL_CONSTANT_LOAD instruction.
1574  *
1575  * Unlike temporary GRF array access (where we don't support it due to
1576  * the difficulty of doing relative addressing on instruction
1577  * destinations), we could potentially do array access of uniforms
1578  * that were loaded in GRF space as push constants.  In real-world
1579  * usage we've seen, though, the arrays being used are always larger
1580  * than we could load as push constants, so just always move all
1581  * uniform array access out to a pull constant buffer.
1582  */
1583 void
1584 fs_visitor::move_uniform_array_access_to_pull_constants()
1585 {
1586    int pull_constant_loc[c->prog_data.nr_params];
1587
1588    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1589       pull_constant_loc[i] = -1;
1590    }
1591
1592    /* Walk through and find array access of uniforms.  Put a copy of that
1593     * uniform in the pull constant buffer.
1594     *
1595     * Note that we don't move constant-indexed accesses to arrays.  No
1596     * testing has been done of the performance impact of this choice.
1597     */
1598    foreach_list_safe(node, &this->instructions) {
1599       fs_inst *inst = (fs_inst *)node;
1600
1601       for (int i = 0 ; i < 3; i++) {
1602          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1603             continue;
1604
1605          int uniform = inst->src[i].reg;
1606
1607          /* If this array isn't already present in the pull constant buffer,
1608           * add it.
1609           */
1610          if (pull_constant_loc[uniform] == -1) {
1611             const float **values = &c->prog_data.param[uniform];
1612
1613             pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1614
1615             assert(param_size[uniform]);
1616
1617             for (int j = 0; j < param_size[uniform]; j++) {
1618                c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1619                   values[j];
1620             }
1621          }
1622
1623          /* Set up the annotation tracking for new generated instructions. */
1624          base_ir = inst->ir;
1625          current_annotation = inst->annotation;
1626
1627          fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1628          fs_reg temp = fs_reg(this, glsl_type::float_type);
1629          exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1630                                                      surf_index,
1631                                                      *inst->src[i].reladdr,
1632                                                      pull_constant_loc[uniform] +
1633                                                      inst->src[i].reg_offset);
1634          inst->insert_before(&list);
1635
1636          inst->src[i].file = temp.file;
1637          inst->src[i].reg = temp.reg;
1638          inst->src[i].reg_offset = temp.reg_offset;
1639          inst->src[i].reladdr = NULL;
1640       }
1641    }
1642 }
1643
1644 /**
1645  * Choose accesses from the UNIFORM file to demote to using the pull
1646  * constant buffer.
1647  *
1648  * We allow a fragment shader to have more than the specified minimum
1649  * maximum number of fragment shader uniform components (64).  If
1650  * there are too many of these, they'd fill up all of register space.
1651  * So, this will push some of them out to the pull constant buffer and
1652  * update the program to load them.
1653  */
1654 void
1655 fs_visitor::setup_pull_constants()
1656 {
1657    /* Only allow 16 registers (128 uniform components) as push constants. */
1658    unsigned int max_uniform_components = 16 * 8;
1659    if (c->prog_data.nr_params <= max_uniform_components)
1660       return;
1661
1662    if (dispatch_width == 16) {
1663       fail("Pull constants not supported in 16-wide\n");
1664       return;
1665    }
1666
1667    /* Just demote the end of the list.  We could probably do better
1668     * here, demoting things that are rarely used in the program first.
1669     */
1670    unsigned int pull_uniform_base = max_uniform_components;
1671
1672    int pull_constant_loc[c->prog_data.nr_params];
1673    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1674       if (i < pull_uniform_base) {
1675          pull_constant_loc[i] = -1;
1676       } else {
1677          pull_constant_loc[i] = -1;
1678          /* If our constant is already being uploaded for reladdr purposes,
1679           * reuse it.
1680           */
1681          for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1682             if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1683                pull_constant_loc[i] = j;
1684                break;
1685             }
1686          }
1687          if (pull_constant_loc[i] == -1) {
1688             int pull_index = c->prog_data.nr_pull_params++;
1689             c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1690             pull_constant_loc[i] = pull_index;;
1691          }
1692       }
1693    }
1694    c->prog_data.nr_params = pull_uniform_base;
1695
1696    foreach_list(node, &this->instructions) {
1697       fs_inst *inst = (fs_inst *)node;
1698
1699       for (int i = 0; i < 3; i++) {
1700          if (inst->src[i].file != UNIFORM)
1701             continue;
1702
1703          int pull_index = pull_constant_loc[inst->src[i].reg +
1704                                             inst->src[i].reg_offset];
1705          if (pull_index == -1)
1706             continue;
1707
1708          assert(!inst->src[i].reladdr);
1709
1710          fs_reg dst = fs_reg(this, glsl_type::float_type);
1711          fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1712          fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1713          fs_inst *pull =
1714             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1715                                  dst, index, offset);
1716          pull->ir = inst->ir;
1717          pull->annotation = inst->annotation;
1718
1719          inst->insert_before(pull);
1720
1721          inst->src[i].file = GRF;
1722          inst->src[i].reg = dst.reg;
1723          inst->src[i].reg_offset = 0;
1724          inst->src[i].smear = pull_index & 3;
1725       }
1726    }
1727 }
1728
1729 bool
1730 fs_visitor::opt_algebraic()
1731 {
1732    bool progress = false;
1733
1734    foreach_list(node, &this->instructions) {
1735       fs_inst *inst = (fs_inst *)node;
1736
1737       switch (inst->opcode) {
1738       case BRW_OPCODE_MUL:
1739          if (inst->src[1].file != IMM)
1740             continue;
1741
1742          /* a * 1.0 = a */
1743          if (inst->src[1].is_one()) {
1744             inst->opcode = BRW_OPCODE_MOV;
1745             inst->src[1] = reg_undef;
1746             progress = true;
1747             break;
1748          }
1749
1750          /* a * 0.0 = 0.0 */
1751          if (inst->src[1].is_zero()) {
1752             inst->opcode = BRW_OPCODE_MOV;
1753             inst->src[0] = inst->src[1];
1754             inst->src[1] = reg_undef;
1755             progress = true;
1756             break;
1757          }
1758
1759          break;
1760       case BRW_OPCODE_ADD:
1761          if (inst->src[1].file != IMM)
1762             continue;
1763
1764          /* a + 0.0 = a */
1765          if (inst->src[1].is_zero()) {
1766             inst->opcode = BRW_OPCODE_MOV;
1767             inst->src[1] = reg_undef;
1768             progress = true;
1769             break;
1770          }
1771          break;
1772       default:
1773          break;
1774       }
1775    }
1776
1777    return progress;
1778 }
1779
1780 /**
1781  * Must be called after calculate_live_intervales() to remove unused
1782  * writes to registers -- register allocation will fail otherwise
1783  * because something deffed but not used won't be considered to
1784  * interfere with other regs.
1785  */
1786 bool
1787 fs_visitor::dead_code_eliminate()
1788 {
1789    bool progress = false;
1790    int pc = 0;
1791
1792    calculate_live_intervals();
1793
1794    foreach_list_safe(node, &this->instructions) {
1795       fs_inst *inst = (fs_inst *)node;
1796
1797       if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1798          inst->remove();
1799          progress = true;
1800       }
1801
1802       pc++;
1803    }
1804
1805    if (progress)
1806       live_intervals_valid = false;
1807
1808    return progress;
1809 }
1810
1811 /**
1812  * Implements a second type of register coalescing: This one checks if
1813  * the two regs involved in a raw move don't interfere, in which case
1814  * they can both by stored in the same place and the MOV removed.
1815  */
1816 bool
1817 fs_visitor::register_coalesce_2()
1818 {
1819    bool progress = false;
1820
1821    calculate_live_intervals();
1822
1823    foreach_list_safe(node, &this->instructions) {
1824       fs_inst *inst = (fs_inst *)node;
1825
1826       if (inst->opcode != BRW_OPCODE_MOV ||
1827           inst->predicate ||
1828           inst->saturate ||
1829           inst->src[0].file != GRF ||
1830           inst->src[0].negate ||
1831           inst->src[0].abs ||
1832           inst->src[0].smear != -1 ||
1833           inst->dst.file != GRF ||
1834           inst->dst.type != inst->src[0].type ||
1835           virtual_grf_sizes[inst->src[0].reg] != 1 ||
1836           virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1837          continue;
1838       }
1839
1840       int reg_from = inst->src[0].reg;
1841       assert(inst->src[0].reg_offset == 0);
1842       int reg_to = inst->dst.reg;
1843       int reg_to_offset = inst->dst.reg_offset;
1844
1845       foreach_list(node, &this->instructions) {
1846          fs_inst *scan_inst = (fs_inst *)node;
1847
1848          if (scan_inst->dst.file == GRF &&
1849              scan_inst->dst.reg == reg_from) {
1850             scan_inst->dst.reg = reg_to;
1851             scan_inst->dst.reg_offset = reg_to_offset;
1852          }
1853          for (int i = 0; i < 3; i++) {
1854             if (scan_inst->src[i].file == GRF &&
1855                 scan_inst->src[i].reg == reg_from) {
1856                scan_inst->src[i].reg = reg_to;
1857                scan_inst->src[i].reg_offset = reg_to_offset;
1858             }
1859          }
1860       }
1861
1862       inst->remove();
1863
1864       /* We don't need to recalculate live intervals inside the loop despite
1865        * flagging live_intervals_valid because we only use live intervals for
1866        * the interferes test, and we must have had a situation where the
1867        * intervals were:
1868        *
1869        *  from  to
1870        *  ^
1871        *  |
1872        *  v
1873        *        ^
1874        *        |
1875        *        v
1876        *
1877        * Some register R that might get coalesced with one of these two could
1878        * only be referencing "to", otherwise "from"'s range would have been
1879        * longer.  R's range could also only start at the end of "to" or later,
1880        * otherwise it will conflict with "to" when we try to coalesce "to"
1881        * into Rw anyway.
1882        */
1883       live_intervals_valid = false;
1884
1885       progress = true;
1886       continue;
1887    }
1888
1889    return progress;
1890 }
1891
1892 bool
1893 fs_visitor::register_coalesce()
1894 {
1895    bool progress = false;
1896    int if_depth = 0;
1897    int loop_depth = 0;
1898
1899    foreach_list_safe(node, &this->instructions) {
1900       fs_inst *inst = (fs_inst *)node;
1901
1902       /* Make sure that we dominate the instructions we're going to
1903        * scan for interfering with our coalescing, or we won't have
1904        * scanned enough to see if anything interferes with our
1905        * coalescing.  We don't dominate the following instructions if
1906        * we're in a loop or an if block.
1907        */
1908       switch (inst->opcode) {
1909       case BRW_OPCODE_DO:
1910          loop_depth++;
1911          break;
1912       case BRW_OPCODE_WHILE:
1913          loop_depth--;
1914          break;
1915       case BRW_OPCODE_IF:
1916          if_depth++;
1917          break;
1918       case BRW_OPCODE_ENDIF:
1919          if_depth--;
1920          break;
1921       default:
1922          break;
1923       }
1924       if (loop_depth || if_depth)
1925          continue;
1926
1927       if (inst->opcode != BRW_OPCODE_MOV ||
1928           inst->predicate ||
1929           inst->saturate ||
1930           inst->dst.file != GRF || (inst->src[0].file != GRF &&
1931                                     inst->src[0].file != UNIFORM)||
1932           inst->dst.type != inst->src[0].type)
1933          continue;
1934
1935       bool has_source_modifiers = (inst->src[0].abs ||
1936                                    inst->src[0].negate ||
1937                                    inst->src[0].smear != -1 ||
1938                                    inst->src[0].file == UNIFORM);
1939
1940       /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
1941        * them: check for no writes to either one until the exit of the
1942        * program.
1943        */
1944       bool interfered = false;
1945
1946       for (fs_inst *scan_inst = (fs_inst *)inst->next;
1947            !scan_inst->is_tail_sentinel();
1948            scan_inst = (fs_inst *)scan_inst->next) {
1949          if (scan_inst->dst.file == GRF) {
1950             if (scan_inst->overwrites_reg(inst->dst) ||
1951                 scan_inst->overwrites_reg(inst->src[0])) {
1952                interfered = true;
1953                break;
1954             }
1955          }
1956
1957          /* The gen6 MATH instruction can't handle source modifiers or
1958           * unusual register regions, so avoid coalescing those for
1959           * now.  We should do something more specific.
1960           */
1961          if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1962             interfered = true;
1963             break;
1964          }
1965
1966          /* The accumulator result appears to get used for the
1967           * conditional modifier generation.  When negating a UD
1968           * value, there is a 33rd bit generated for the sign in the
1969           * accumulator value, so now you can't check, for example,
1970           * equality with a 32-bit value.  See piglit fs-op-neg-uint.
1971           */
1972          if (scan_inst->conditional_mod &&
1973              inst->src[0].negate &&
1974              inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1975             interfered = true;
1976             break;
1977          }
1978       }
1979       if (interfered) {
1980          continue;
1981       }
1982
1983       /* Rewrite the later usage to point at the source of the move to
1984        * be removed.
1985        */
1986       for (fs_inst *scan_inst = inst;
1987            !scan_inst->is_tail_sentinel();
1988            scan_inst = (fs_inst *)scan_inst->next) {
1989          for (int i = 0; i < 3; i++) {
1990             if (scan_inst->src[i].file == GRF &&
1991                 scan_inst->src[i].reg == inst->dst.reg &&
1992                 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1993                fs_reg new_src = inst->src[0];
1994                if (scan_inst->src[i].abs) {
1995                   new_src.negate = 0;
1996                   new_src.abs = 1;
1997                }
1998                new_src.negate ^= scan_inst->src[i].negate;
1999                scan_inst->src[i] = new_src;
2000             }
2001          }
2002       }
2003
2004       inst->remove();
2005       progress = true;
2006    }
2007
2008    if (progress)
2009       live_intervals_valid = false;
2010
2011    return progress;
2012 }
2013
2014
2015 bool
2016 fs_visitor::compute_to_mrf()
2017 {
2018    bool progress = false;
2019    int next_ip = 0;
2020
2021    calculate_live_intervals();
2022
2023    foreach_list_safe(node, &this->instructions) {
2024       fs_inst *inst = (fs_inst *)node;
2025
2026       int ip = next_ip;
2027       next_ip++;
2028
2029       if (inst->opcode != BRW_OPCODE_MOV ||
2030           inst->predicate ||
2031           inst->dst.file != MRF || inst->src[0].file != GRF ||
2032           inst->dst.type != inst->src[0].type ||
2033           inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2034          continue;
2035
2036       /* Work out which hardware MRF registers are written by this
2037        * instruction.
2038        */
2039       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2040       int mrf_high;
2041       if (inst->dst.reg & BRW_MRF_COMPR4) {
2042          mrf_high = mrf_low + 4;
2043       } else if (dispatch_width == 16 &&
2044                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2045          mrf_high = mrf_low + 1;
2046       } else {
2047          mrf_high = mrf_low;
2048       }
2049
2050       /* Can't compute-to-MRF this GRF if someone else was going to
2051        * read it later.
2052        */
2053       if (this->virtual_grf_use[inst->src[0].reg] > ip)
2054          continue;
2055
2056       /* Found a move of a GRF to a MRF.  Let's see if we can go
2057        * rewrite the thing that made this GRF to write into the MRF.
2058        */
2059       fs_inst *scan_inst;
2060       for (scan_inst = (fs_inst *)inst->prev;
2061            scan_inst->prev != NULL;
2062            scan_inst = (fs_inst *)scan_inst->prev) {
2063          if (scan_inst->dst.file == GRF &&
2064              scan_inst->dst.reg == inst->src[0].reg) {
2065             /* Found the last thing to write our reg we want to turn
2066              * into a compute-to-MRF.
2067              */
2068
2069             /* If it's predicated, it (probably) didn't populate all
2070              * the channels.  We might be able to rewrite everything
2071              * that writes that reg, but it would require smarter
2072              * tracking to delay the rewriting until complete success.
2073              */
2074             if (scan_inst->predicate)
2075                break;
2076
2077             /* If it's half of register setup and not the same half as
2078              * our MOV we're trying to remove, bail for now.
2079              */
2080             if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2081                 scan_inst->force_sechalf != inst->force_sechalf) {
2082                break;
2083             }
2084
2085             /* Things returning more than one register would need us to
2086              * understand coalescing out more than one MOV at a time.
2087              */
2088             if (scan_inst->regs_written() > 1)
2089                break;
2090
2091             /* SEND instructions can't have MRF as a destination. */
2092             if (scan_inst->mlen)
2093                break;
2094
2095             if (intel->gen == 6) {
2096                /* gen6 math instructions must have the destination be
2097                 * GRF, so no compute-to-MRF for them.
2098                 */
2099                if (scan_inst->is_math()) {
2100                   break;
2101                }
2102             }
2103
2104             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2105                /* Found the creator of our MRF's source value. */
2106                scan_inst->dst.file = MRF;
2107                scan_inst->dst.reg = inst->dst.reg;
2108                scan_inst->saturate |= inst->saturate;
2109                inst->remove();
2110                progress = true;
2111             }
2112             break;
2113          }
2114
2115          /* We don't handle control flow here.  Most computation of
2116           * values that end up in MRFs are shortly before the MRF
2117           * write anyway.
2118           */
2119          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2120             break;
2121
2122          /* You can't read from an MRF, so if someone else reads our
2123           * MRF's source GRF that we wanted to rewrite, that stops us.
2124           */
2125          bool interfered = false;
2126          for (int i = 0; i < 3; i++) {
2127             if (scan_inst->src[i].file == GRF &&
2128                 scan_inst->src[i].reg == inst->src[0].reg &&
2129                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2130                interfered = true;
2131             }
2132          }
2133          if (interfered)
2134             break;
2135
2136          if (scan_inst->dst.file == MRF) {
2137             /* If somebody else writes our MRF here, we can't
2138              * compute-to-MRF before that.
2139              */
2140             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2141             int scan_mrf_high;
2142
2143             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2144                scan_mrf_high = scan_mrf_low + 4;
2145             } else if (dispatch_width == 16 &&
2146                        (!scan_inst->force_uncompressed &&
2147                         !scan_inst->force_sechalf)) {
2148                scan_mrf_high = scan_mrf_low + 1;
2149             } else {
2150                scan_mrf_high = scan_mrf_low;
2151             }
2152
2153             if (mrf_low == scan_mrf_low ||
2154                 mrf_low == scan_mrf_high ||
2155                 mrf_high == scan_mrf_low ||
2156                 mrf_high == scan_mrf_high) {
2157                break;
2158             }
2159          }
2160
2161          if (scan_inst->mlen > 0) {
2162             /* Found a SEND instruction, which means that there are
2163              * live values in MRFs from base_mrf to base_mrf +
2164              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2165              * above it.
2166              */
2167             if (mrf_low >= scan_inst->base_mrf &&
2168                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2169                break;
2170             }
2171             if (mrf_high >= scan_inst->base_mrf &&
2172                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2173                break;
2174             }
2175          }
2176       }
2177    }
2178
2179    if (progress)
2180       live_intervals_valid = false;
2181
2182    return progress;
2183 }
2184
2185 /**
2186  * Walks through basic blocks, looking for repeated MRF writes and
2187  * removing the later ones.
2188  */
2189 bool
2190 fs_visitor::remove_duplicate_mrf_writes()
2191 {
2192    fs_inst *last_mrf_move[16];
2193    bool progress = false;
2194
2195    /* Need to update the MRF tracking for compressed instructions. */
2196    if (dispatch_width == 16)
2197       return false;
2198
2199    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2200
2201    foreach_list_safe(node, &this->instructions) {
2202       fs_inst *inst = (fs_inst *)node;
2203
2204       if (inst->is_control_flow()) {
2205          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2206       }
2207
2208       if (inst->opcode == BRW_OPCODE_MOV &&
2209           inst->dst.file == MRF) {
2210          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2211          if (prev_inst && inst->equals(prev_inst)) {
2212             inst->remove();
2213             progress = true;
2214             continue;
2215          }
2216       }
2217
2218       /* Clear out the last-write records for MRFs that were overwritten. */
2219       if (inst->dst.file == MRF) {
2220          last_mrf_move[inst->dst.reg] = NULL;
2221       }
2222
2223       if (inst->mlen > 0) {
2224          /* Found a SEND instruction, which will include two or fewer
2225           * implied MRF writes.  We could do better here.
2226           */
2227          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2228             last_mrf_move[inst->base_mrf + i] = NULL;
2229          }
2230       }
2231
2232       /* Clear out any MRF move records whose sources got overwritten. */
2233       if (inst->dst.file == GRF) {
2234          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2235             if (last_mrf_move[i] &&
2236                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2237                last_mrf_move[i] = NULL;
2238             }
2239          }
2240       }
2241
2242       if (inst->opcode == BRW_OPCODE_MOV &&
2243           inst->dst.file == MRF &&
2244           inst->src[0].file == GRF &&
2245           !inst->predicate) {
2246          last_mrf_move[inst->dst.reg] = inst;
2247       }
2248    }
2249
2250    if (progress)
2251       live_intervals_valid = false;
2252
2253    return progress;
2254 }
2255
2256 static void
2257 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2258                         int first_grf, int grf_len)
2259 {
2260    bool inst_16wide = (dispatch_width > 8 &&
2261                        !inst->force_uncompressed &&
2262                        !inst->force_sechalf);
2263
2264    /* Clear the flag for registers that actually got read (as expected). */
2265    for (int i = 0; i < 3; i++) {
2266       int grf;
2267       if (inst->src[i].file == GRF) {
2268          grf = inst->src[i].reg;
2269       } else if (inst->src[i].file == FIXED_HW_REG &&
2270                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2271          grf = inst->src[i].fixed_hw_reg.nr;
2272       } else {
2273          continue;
2274       }
2275
2276       if (grf >= first_grf &&
2277           grf < first_grf + grf_len) {
2278          deps[grf - first_grf] = false;
2279          if (inst_16wide)
2280             deps[grf - first_grf + 1] = false;
2281       }
2282    }
2283 }
2284
2285 /**
2286  * Implements this workaround for the original 965:
2287  *
2288  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2289  *      check for post destination dependencies on this instruction, software
2290  *      must ensure that there is no destination hazard for the case of ‘write
2291  *      followed by a posted write’ shown in the following example.
2292  *
2293  *      1. mov r3 0
2294  *      2. send r3.xy <rest of send instruction>
2295  *      3. mov r2 r3
2296  *
2297  *      Due to no post-destination dependency check on the ‘send’, the above
2298  *      code sequence could have two instructions (1 and 2) in flight at the
2299  *      same time that both consider ‘r3’ as the target of their final writes.
2300  */
2301 void
2302 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2303 {
2304    int reg_size = dispatch_width / 8;
2305    int write_len = inst->regs_written() * reg_size;
2306    int first_write_grf = inst->dst.reg;
2307    bool needs_dep[BRW_MAX_MRF];
2308    assert(write_len < (int)sizeof(needs_dep) - 1);
2309
2310    memset(needs_dep, false, sizeof(needs_dep));
2311    memset(needs_dep, true, write_len);
2312
2313    clear_deps_for_inst_src(inst, dispatch_width,
2314                            needs_dep, first_write_grf, write_len);
2315
2316    /* Walk backwards looking for writes to registers we're writing which
2317     * aren't read since being written.  If we hit the start of the program,
2318     * we assume that there are no outstanding dependencies on entry to the
2319     * program.
2320     */
2321    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2322         scan_inst != NULL;
2323         scan_inst = (fs_inst *)scan_inst->prev) {
2324
2325       /* If we hit control flow, assume that there *are* outstanding
2326        * dependencies, and force their cleanup before our instruction.
2327        */
2328       if (scan_inst->is_control_flow()) {
2329          for (int i = 0; i < write_len; i++) {
2330             if (needs_dep[i]) {
2331                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2332             }
2333          }
2334       }
2335
2336       bool scan_inst_16wide = (dispatch_width > 8 &&
2337                                !scan_inst->force_uncompressed &&
2338                                !scan_inst->force_sechalf);
2339
2340       /* We insert our reads as late as possible on the assumption that any
2341        * instruction but a MOV that might have left us an outstanding
2342        * dependency has more latency than a MOV.
2343        */
2344       if (scan_inst->dst.file == GRF) {
2345          for (int i = 0; i < scan_inst->regs_written(); i++) {
2346             int reg = scan_inst->dst.reg + i * reg_size;
2347
2348             if (reg >= first_write_grf &&
2349                 reg < first_write_grf + write_len &&
2350                 needs_dep[reg - first_write_grf]) {
2351                inst->insert_before(DEP_RESOLVE_MOV(reg));
2352                needs_dep[reg - first_write_grf] = false;
2353                if (scan_inst_16wide)
2354                   needs_dep[reg - first_write_grf + 1] = false;
2355             }
2356          }
2357       }
2358
2359       /* Clear the flag for registers that actually got read (as expected). */
2360       clear_deps_for_inst_src(scan_inst, dispatch_width,
2361                               needs_dep, first_write_grf, write_len);
2362
2363       /* Continue the loop only if we haven't resolved all the dependencies */
2364       int i;
2365       for (i = 0; i < write_len; i++) {
2366          if (needs_dep[i])
2367             break;
2368       }
2369       if (i == write_len)
2370          return;
2371    }
2372 }
2373
2374 /**
2375  * Implements this workaround for the original 965:
2376  *
2377  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2378  *      used as a destination register until after it has been sourced by an
2379  *      instruction with a different destination register.
2380  */
2381 void
2382 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2383 {
2384    int write_len = inst->regs_written() * dispatch_width / 8;
2385    int first_write_grf = inst->dst.reg;
2386    bool needs_dep[BRW_MAX_MRF];
2387    assert(write_len < (int)sizeof(needs_dep) - 1);
2388
2389    memset(needs_dep, false, sizeof(needs_dep));
2390    memset(needs_dep, true, write_len);
2391    /* Walk forwards looking for writes to registers we're writing which aren't
2392     * read before being written.
2393     */
2394    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2395         !scan_inst->is_tail_sentinel();
2396         scan_inst = (fs_inst *)scan_inst->next) {
2397       /* If we hit control flow, force resolve all remaining dependencies. */
2398       if (scan_inst->is_control_flow()) {
2399          for (int i = 0; i < write_len; i++) {
2400             if (needs_dep[i])
2401                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2402          }
2403       }
2404
2405       /* Clear the flag for registers that actually got read (as expected). */
2406       clear_deps_for_inst_src(scan_inst, dispatch_width,
2407                               needs_dep, first_write_grf, write_len);
2408
2409       /* We insert our reads as late as possible since they're reading the
2410        * result of a SEND, which has massive latency.
2411        */
2412       if (scan_inst->dst.file == GRF &&
2413           scan_inst->dst.reg >= first_write_grf &&
2414           scan_inst->dst.reg < first_write_grf + write_len &&
2415           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2416          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2417          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2418       }
2419
2420       /* Continue the loop only if we haven't resolved all the dependencies */
2421       int i;
2422       for (i = 0; i < write_len; i++) {
2423          if (needs_dep[i])
2424             break;
2425       }
2426       if (i == write_len)
2427          return;
2428    }
2429
2430    /* If we hit the end of the program, resolve all remaining dependencies out
2431     * of paranoia.
2432     */
2433    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2434    assert(last_inst->eot);
2435    for (int i = 0; i < write_len; i++) {
2436       if (needs_dep[i])
2437          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2438    }
2439 }
2440
2441 void
2442 fs_visitor::insert_gen4_send_dependency_workarounds()
2443 {
2444    if (intel->gen != 4 || intel->is_g4x)
2445       return;
2446
2447    /* Note that we're done with register allocation, so GRF fs_regs always
2448     * have a .reg_offset of 0.
2449     */
2450
2451    foreach_list_safe(node, &this->instructions) {
2452       fs_inst *inst = (fs_inst *)node;
2453
2454       if (inst->mlen != 0 && inst->dst.file == GRF) {
2455          insert_gen4_pre_send_dependency_workarounds(inst);
2456          insert_gen4_post_send_dependency_workarounds(inst);
2457       }
2458    }
2459 }
2460
2461 /**
2462  * Turns the generic expression-style uniform pull constant load instruction
2463  * into a hardware-specific series of instructions for loading a pull
2464  * constant.
2465  *
2466  * The expression style allows the CSE pass before this to optimize out
2467  * repeated loads from the same offset, and gives the pre-register-allocation
2468  * scheduling full flexibility, while the conversion to native instructions
2469  * allows the post-register-allocation scheduler the best information
2470  * possible.
2471  *
2472  * Note that execution masking for setting up pull constant loads is special:
2473  * the channels that need to be written are unrelated to the current execution
2474  * mask, since a later instruction will use one of the result channels as a
2475  * source operand for all 8 or 16 of its channels.
2476  */
2477 void
2478 fs_visitor::lower_uniform_pull_constant_loads()
2479 {
2480    foreach_list(node, &this->instructions) {
2481       fs_inst *inst = (fs_inst *)node;
2482
2483       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2484          continue;
2485
2486       if (intel->gen >= 7) {
2487          /* The offset arg before was a vec4-aligned byte offset.  We need to
2488           * turn it into a dword offset.
2489           */
2490          fs_reg const_offset_reg = inst->src[1];
2491          assert(const_offset_reg.file == IMM &&
2492                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2493          const_offset_reg.imm.u /= 4;
2494          fs_reg payload = fs_reg(this, glsl_type::uint_type);
2495
2496          /* This is actually going to be a MOV, but since only the first dword
2497           * is accessed, we have a special opcode to do just that one.  Note
2498           * that this needs to be an operation that will be considered a def
2499           * by live variable analysis, or register allocation will explode.
2500           */
2501          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2502                                                payload, const_offset_reg);
2503          setup->force_writemask_all = true;
2504
2505          setup->ir = inst->ir;
2506          setup->annotation = inst->annotation;
2507          inst->insert_before(setup);
2508
2509          /* Similarly, this will only populate the first 4 channels of the
2510           * result register (since we only use smear values from 0-3), but we
2511           * don't tell the optimizer.
2512           */
2513          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2514          inst->src[1] = payload;
2515
2516          this->live_intervals_valid = false;
2517       } else {
2518          /* Before register allocation, we didn't tell the scheduler about the
2519           * MRF we use.  We know it's safe to use this MRF because nothing
2520           * else does except for register spill/unspill, which generates and
2521           * uses its MRF within a single IR instruction.
2522           */
2523          inst->base_mrf = 14;
2524          inst->mlen = 1;
2525       }
2526    }
2527 }
2528
2529 void
2530 fs_visitor::dump_instruction(fs_inst *inst)
2531 {
2532    if (inst->predicate) {
2533       printf("(%cf0.%d) ",
2534              inst->predicate_inverse ? '-' : '+',
2535              inst->flag_subreg);
2536    }
2537
2538    printf("%s", brw_instruction_name(inst->opcode));
2539    if (inst->saturate)
2540       printf(".sat");
2541    if (inst->conditional_mod) {
2542       printf(".cmod");
2543       if (!inst->predicate &&
2544           (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2545                               inst->opcode != BRW_OPCODE_IF &&
2546                               inst->opcode != BRW_OPCODE_WHILE))) {
2547          printf(".f0.%d\n", inst->flag_subreg);
2548       }
2549    }
2550    printf(" ");
2551
2552
2553    switch (inst->dst.file) {
2554    case GRF:
2555       printf("vgrf%d", inst->dst.reg);
2556       if (inst->dst.reg_offset)
2557          printf("+%d", inst->dst.reg_offset);
2558       break;
2559    case MRF:
2560       printf("m%d", inst->dst.reg);
2561       break;
2562    case BAD_FILE:
2563       printf("(null)");
2564       break;
2565    case UNIFORM:
2566       printf("***u%d***", inst->dst.reg);
2567       break;
2568    default:
2569       printf("???");
2570       break;
2571    }
2572    printf(", ");
2573
2574    for (int i = 0; i < 3; i++) {
2575       if (inst->src[i].negate)
2576          printf("-");
2577       if (inst->src[i].abs)
2578          printf("|");
2579       switch (inst->src[i].file) {
2580       case GRF:
2581          printf("vgrf%d", inst->src[i].reg);
2582          if (inst->src[i].reg_offset)
2583             printf("+%d", inst->src[i].reg_offset);
2584          break;
2585       case MRF:
2586          printf("***m%d***", inst->src[i].reg);
2587          break;
2588       case UNIFORM:
2589          printf("u%d", inst->src[i].reg);
2590          if (inst->src[i].reg_offset)
2591             printf(".%d", inst->src[i].reg_offset);
2592          break;
2593       case BAD_FILE:
2594          printf("(null)");
2595          break;
2596       case IMM:
2597          switch (inst->src[i].type) {
2598          case BRW_REGISTER_TYPE_F:
2599             printf("%ff", inst->src[i].imm.f);
2600             break;
2601          case BRW_REGISTER_TYPE_D:
2602             printf("%dd", inst->src[i].imm.i);
2603             break;
2604          case BRW_REGISTER_TYPE_UD:
2605             printf("%uu", inst->src[i].imm.u);
2606             break;
2607          default:
2608             printf("???");
2609             break;
2610          }
2611          break;
2612       default:
2613          printf("???");
2614          break;
2615       }
2616       if (inst->src[i].abs)
2617          printf("|");
2618
2619       if (i < 3)
2620          printf(", ");
2621    }
2622
2623    printf(" ");
2624
2625    if (inst->force_uncompressed)
2626       printf("1sthalf ");
2627
2628    if (inst->force_sechalf)
2629       printf("2ndhalf ");
2630
2631    printf("\n");
2632 }
2633
2634 void
2635 fs_visitor::dump_instructions()
2636 {
2637    int ip = 0;
2638    foreach_list(node, &this->instructions) {
2639       fs_inst *inst = (fs_inst *)node;
2640       printf("%d: ", ip++);
2641       dump_instruction(inst);
2642    }
2643 }
2644
2645 /**
2646  * Possibly returns an instruction that set up @param reg.
2647  *
2648  * Sometimes we want to take the result of some expression/variable
2649  * dereference tree and rewrite the instruction generating the result
2650  * of the tree.  When processing the tree, we know that the
2651  * instructions generated are all writing temporaries that are dead
2652  * outside of this tree.  So, if we have some instructions that write
2653  * a temporary, we're free to point that temp write somewhere else.
2654  *
2655  * Note that this doesn't guarantee that the instruction generated
2656  * only reg -- it might be the size=4 destination of a texture instruction.
2657  */
2658 fs_inst *
2659 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2660                                            fs_inst *end,
2661                                            fs_reg reg)
2662 {
2663    if (end == start ||
2664        end->predicate ||
2665        end->force_uncompressed ||
2666        end->force_sechalf ||
2667        reg.reladdr ||
2668        !reg.equals(end->dst)) {
2669       return NULL;
2670    } else {
2671       return end;
2672    }
2673 }
2674
2675 void
2676 fs_visitor::setup_payload_gen6()
2677 {
2678    struct intel_context *intel = &brw->intel;
2679    bool uses_depth =
2680       (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2681    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2682
2683    assert(intel->gen >= 6);
2684
2685    /* R0-1: masks, pixel X/Y coordinates. */
2686    c->nr_payload_regs = 2;
2687    /* R2: only for 32-pixel dispatch.*/
2688
2689    /* R3-26: barycentric interpolation coordinates.  These appear in the
2690     * same order that they appear in the brw_wm_barycentric_interp_mode
2691     * enum.  Each set of coordinates occupies 2 registers if dispatch width
2692     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2693     * appear if they were enabled using the "Barycentric Interpolation
2694     * Mode" bits in WM_STATE.
2695     */
2696    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2697       if (barycentric_interp_modes & (1 << i)) {
2698          c->barycentric_coord_reg[i] = c->nr_payload_regs;
2699          c->nr_payload_regs += 2;
2700          if (dispatch_width == 16) {
2701             c->nr_payload_regs += 2;
2702          }
2703       }
2704    }
2705
2706    /* R27: interpolated depth if uses source depth */
2707    if (uses_depth) {
2708       c->source_depth_reg = c->nr_payload_regs;
2709       c->nr_payload_regs++;
2710       if (dispatch_width == 16) {
2711          /* R28: interpolated depth if not 8-wide. */
2712          c->nr_payload_regs++;
2713       }
2714    }
2715    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2716    if (uses_depth) {
2717       c->source_w_reg = c->nr_payload_regs;
2718       c->nr_payload_regs++;
2719       if (dispatch_width == 16) {
2720          /* R30: interpolated W if not 8-wide. */
2721          c->nr_payload_regs++;
2722       }
2723    }
2724    /* R31: MSAA position offsets. */
2725    /* R32-: bary for 32-pixel. */
2726    /* R58-59: interp W for 32-pixel. */
2727
2728    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2729       c->source_depth_to_render_target = true;
2730    }
2731 }
2732
2733 bool
2734 fs_visitor::run()
2735 {
2736    sanity_param_count = fp->Base.Parameters->NumParameters;
2737    uint32_t orig_nr_params = c->prog_data.nr_params;
2738
2739    if (intel->gen >= 6)
2740       setup_payload_gen6();
2741    else
2742       setup_payload_gen4();
2743
2744    if (0) {
2745       emit_dummy_fs();
2746    } else {
2747       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2748          emit_shader_time_begin();
2749
2750       calculate_urb_setup();
2751       if (intel->gen < 6)
2752          emit_interpolation_setup_gen4();
2753       else
2754          emit_interpolation_setup_gen6();
2755
2756       /* We handle discards by keeping track of the still-live pixels in f0.1.
2757        * Initialize it with the dispatched pixels.
2758        */
2759       if (fp->UsesKill) {
2760          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2761          discard_init->flag_subreg = 1;
2762       }
2763
2764       /* Generate FS IR for main().  (the visitor only descends into
2765        * functions called "main").
2766        */
2767       if (shader) {
2768          foreach_list(node, &*shader->ir) {
2769             ir_instruction *ir = (ir_instruction *)node;
2770             base_ir = ir;
2771             this->result = reg_undef;
2772             ir->accept(this);
2773          }
2774       } else {
2775          emit_fragment_program_code();
2776       }
2777       base_ir = NULL;
2778       if (failed)
2779          return false;
2780
2781       emit(FS_OPCODE_PLACEHOLDER_HALT);
2782
2783       emit_fb_writes();
2784
2785       split_virtual_grfs();
2786
2787       move_uniform_array_access_to_pull_constants();
2788       setup_pull_constants();
2789
2790       bool progress;
2791       do {
2792          progress = false;
2793
2794          compact_virtual_grfs();
2795
2796          progress = remove_duplicate_mrf_writes() || progress;
2797
2798          progress = opt_algebraic() || progress;
2799          progress = opt_cse() || progress;
2800          progress = opt_copy_propagate() || progress;
2801          progress = dead_code_eliminate() || progress;
2802          progress = register_coalesce() || progress;
2803          progress = register_coalesce_2() || progress;
2804          progress = compute_to_mrf() || progress;
2805       } while (progress);
2806
2807       remove_dead_constants();
2808
2809       schedule_instructions(false);
2810
2811       lower_uniform_pull_constant_loads();
2812
2813       assign_curb_setup();
2814       assign_urb_setup();
2815
2816       if (0) {
2817          /* Debug of register spilling: Go spill everything. */
2818          for (int i = 0; i < virtual_grf_count; i++) {
2819             spill_reg(i);
2820          }
2821       }
2822
2823       if (0)
2824          assign_regs_trivial();
2825       else {
2826          while (!assign_regs()) {
2827             if (failed)
2828                break;
2829          }
2830       }
2831    }
2832    assert(force_uncompressed_stack == 0);
2833    assert(force_sechalf_stack == 0);
2834
2835    /* This must come after all optimization and register allocation, since
2836     * it inserts dead code that happens to have side effects, and it does
2837     * so based on the actual physical registers in use.
2838     */
2839    insert_gen4_send_dependency_workarounds();
2840
2841    if (failed)
2842       return false;
2843
2844    schedule_instructions(true);
2845
2846    if (dispatch_width == 8) {
2847       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2848    } else {
2849       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2850
2851       /* Make sure we didn't try to sneak in an extra uniform */
2852       assert(orig_nr_params == c->prog_data.nr_params);
2853       (void) orig_nr_params;
2854    }
2855
2856    /* If any state parameters were appended, then ParameterValues could have
2857     * been realloced, in which case the driver uniform storage set up by
2858     * _mesa_associate_uniform_storage() would point to freed memory.  Make
2859     * sure that didn't happen.
2860     */
2861    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2862
2863    return !failed;
2864 }
2865
2866 const unsigned *
2867 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2868                struct gl_fragment_program *fp,
2869                struct gl_shader_program *prog,
2870                unsigned *final_assembly_size)
2871 {
2872    struct intel_context *intel = &brw->intel;
2873    bool start_busy = false;
2874    float start_time = 0;
2875
2876    if (unlikely(intel->perf_debug)) {
2877       start_busy = (intel->batch.last_bo &&
2878                     drm_intel_bo_busy(intel->batch.last_bo));
2879       start_time = get_time();
2880    }
2881
2882    struct brw_shader *shader = NULL;
2883    if (prog)
2884       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2885
2886    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2887       if (shader) {
2888          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2889          _mesa_print_ir(shader->ir, NULL);
2890          printf("\n\n");
2891       } else {
2892          printf("ARB_fragment_program %d ir for native fragment shader\n",
2893                 fp->Base.Id);
2894          _mesa_print_program(&fp->Base);
2895       }
2896    }
2897
2898    /* Now the main event: Visit the shader IR and generate our FS IR for it.
2899     */
2900    fs_visitor v(brw, c, prog, fp, 8);
2901    if (!v.run()) {
2902       prog->LinkStatus = false;
2903       ralloc_strcat(&prog->InfoLog, v.fail_msg);
2904
2905       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2906                     v.fail_msg);
2907
2908       return NULL;
2909    }
2910
2911    exec_list *simd16_instructions = NULL;
2912    fs_visitor v2(brw, c, prog, fp, 16);
2913    bool no16 = INTEL_DEBUG & DEBUG_NO16;
2914    if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2915       v2.import_uniforms(&v);
2916       if (!v2.run()) {
2917          perf_debug("16-wide shader failed to compile, falling back to "
2918                     "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2919       } else {
2920          simd16_instructions = &v2.instructions;
2921       }
2922    }
2923
2924    c->prog_data.dispatch_width = 8;
2925
2926    fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2927    const unsigned *generated = g.generate_assembly(&v.instructions,
2928                                                    simd16_instructions,
2929                                                    final_assembly_size);
2930
2931    if (unlikely(intel->perf_debug) && shader) {
2932       if (shader->compiled_once)
2933          brw_wm_debug_recompile(brw, prog, &c->key);
2934       shader->compiled_once = true;
2935
2936       if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2937          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2938                     (get_time() - start_time) * 1000);
2939       }
2940    }
2941
2942    return generated;
2943 }
2944
2945 bool
2946 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2947 {
2948    struct brw_context *brw = brw_context(ctx);
2949    struct intel_context *intel = &brw->intel;
2950    struct brw_wm_prog_key key;
2951
2952    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2953       return true;
2954
2955    struct gl_fragment_program *fp = (struct gl_fragment_program *)
2956       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2957    struct brw_fragment_program *bfp = brw_fragment_program(fp);
2958    bool program_uses_dfdy = fp->UsesDFdy;
2959
2960    memset(&key, 0, sizeof(key));
2961
2962    if (intel->gen < 6) {
2963       if (fp->UsesKill)
2964          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2965
2966       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2967          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2968
2969       /* Just assume depth testing. */
2970       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2971       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2972    }
2973
2974    if (prog->Name != 0)
2975       key.proj_attrib_mask = ~(GLbitfield64) 0;
2976    else {
2977       /* Bit VARYING_BIT_POS of key.proj_attrib_mask is never used, so to
2978        * avoid unnecessary recompiles, always set it to 1.
2979        */
2980       key.proj_attrib_mask |= VARYING_BIT_POS;
2981    }
2982
2983    if (intel->gen < 6)
2984       key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
2985
2986    for (int i = 0; i < VARYING_SLOT_MAX; i++) {
2987       if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2988          continue;
2989
2990       if (prog->Name == 0)
2991          key.proj_attrib_mask |= BITFIELD64_BIT(i);
2992
2993       if (intel->gen < 6) {
2994          if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
2995             key.input_slots_valid |= BITFIELD64_BIT(i);
2996       }
2997    }
2998
2999    key.clamp_fragment_color = true;
3000
3001    for (int i = 0; i < MAX_SAMPLERS; i++) {
3002       if (fp->Base.ShadowSamplers & (1 << i)) {
3003          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3004          key.tex.swizzles[i] =
3005             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3006       } else {
3007          /* Color sampler: assume no swizzling. */
3008          key.tex.swizzles[i] = SWIZZLE_XYZW;
3009       }
3010    }
3011
3012    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3013       key.drawable_height = ctx->DrawBuffer->Height;
3014    }
3015
3016    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3017       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3018    }
3019
3020    key.nr_color_regions = 1;
3021
3022    key.program_string_id = bfp->id;
3023
3024    uint32_t old_prog_offset = brw->wm.prog_offset;
3025    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3026
3027    bool success = do_wm_prog(brw, prog, bfp, &key);
3028
3029    brw->wm.prog_offset = old_prog_offset;
3030    brw->wm.prog_data = old_prog_data;
3031
3032    return success;
3033 }