i965: Make the fragment shader pull constants index by dwords, not vec4s.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
135 #define ALU1(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
140 }
141
142 #define ALU2(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
147 }
148
149 #define ALU3(op) \
150 fs_inst * \
151 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
152 { \
153 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172
173 /** Gen4 predicated IF. */
174 fs_inst *
175 fs_visitor::IF(uint32_t predicate)
176 {
177 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
178 inst->predicate = predicate;
179 return inst;
180 }
181
182 /** Gen6+ IF with embedded comparison. */
183 fs_inst *
184 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
185 {
186 assert(intel->gen >= 6);
187 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
188 reg_null_d, src0, src1);
189 inst->conditional_mod = condition;
190 return inst;
191 }
192
193 /**
194 * CMP: Sets the low bit of the destination channels with the result
195 * of the comparison, while the upper bits are undefined, and updates
196 * the flag register with the packed 16 bits of the result.
197 */
198 fs_inst *
199 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
200 {
201 fs_inst *inst;
202
203 /* Take the instruction:
204 *
205 * CMP null<d> src0<f> src1<f>
206 *
207 * Original gen4 does type conversion to the destination type before
208 * comparison, producing garbage results for floating point comparisons.
209 * gen5 does the comparison on the execution type (resolved source types),
210 * so dst type doesn't matter. gen6 does comparison and then uses the
211 * result as if it was the dst type with no conversion, which happens to
212 * mostly work out for float-interpreted-as-int since our comparisons are
213 * for >0, =0, <0.
214 */
215 if (intel->gen == 4) {
216 dst.type = src0.type;
217 if (dst.file == FIXED_HW_REG)
218 dst.fixed_hw_reg.type = dst.type;
219 }
220
221 resolve_ud_negate(&src0);
222 resolve_ud_negate(&src1);
223
224 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
225 inst->conditional_mod = condition;
226
227 return inst;
228 }
229
230 exec_list
231 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
232 fs_reg varying_offset,
233 uint32_t const_offset)
234 {
235 exec_list instructions;
236 fs_inst *inst;
237
238 fs_reg offset = fs_reg(this, glsl_type::uint_type);
239 instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
240
241 if (intel->gen >= 7) {
242 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
243 dst, surf_index, offset);
244 instructions.push_tail(inst);
245 } else {
246 int base_mrf = 13;
247 bool header_present = true;
248
249 fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
250 mrf.type = BRW_REGISTER_TYPE_D;
251
252 /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
253 * dword-aligned byte offset.
254 */
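      /* For example (illustrative numbers): a pull constant at dword index 10
       * is passed through unchanged on gen6, while the MUL below turns it
       * into the byte offset 40 for gen4/5.
       */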
255 if (intel->gen == 6) {
256 instructions.push_tail(MOV(mrf, offset));
257 } else {
258 instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
259 }
260 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
261 dst, surf_index);
262 inst->header_present = header_present;
263 inst->base_mrf = base_mrf;
264 inst->mlen = header_present + dispatch_width / 8;
265
266 instructions.push_tail(inst);
267 }
268
269 return instructions;
270 }
271
272 /**
273 * A helper for MOV generation for fixing up broken hardware SEND dependency
274 * handling.
275 */
276 fs_inst *
277 fs_visitor::DEP_RESOLVE_MOV(int grf)
278 {
279 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
280
281 inst->ir = NULL;
282 inst->annotation = "send dependency resolve";
283
284 /* The caller always wants uncompressed to emit the minimal extra
285 * dependencies, and to avoid having to deal with aligning its regs to 2.
286 */
287 inst->force_uncompressed = true;
288
289 return inst;
290 }
291
292 bool
293 fs_inst::equals(fs_inst *inst)
294 {
295 return (opcode == inst->opcode &&
296 dst.equals(inst->dst) &&
297 src[0].equals(inst->src[0]) &&
298 src[1].equals(inst->src[1]) &&
299 src[2].equals(inst->src[2]) &&
300 saturate == inst->saturate &&
301 predicate == inst->predicate &&
302 conditional_mod == inst->conditional_mod &&
303 mlen == inst->mlen &&
304 base_mrf == inst->base_mrf &&
305 sampler == inst->sampler &&
306 target == inst->target &&
307 eot == inst->eot &&
308 header_present == inst->header_present &&
309 shadow_compare == inst->shadow_compare &&
310 offset == inst->offset);
311 }
312
313 int
314 fs_inst::regs_written()
315 {
316 if (is_tex())
317 return 4;
318
319 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
320 * but we don't currently use them...nor do we have an opcode for them.
321 */
322
323 return 1;
324 }
325
326 bool
327 fs_inst::overwrites_reg(const fs_reg &reg)
328 {
329 return (reg.file == dst.file &&
330 reg.reg == dst.reg &&
331 reg.reg_offset >= dst.reg_offset &&
332 reg.reg_offset < dst.reg_offset + regs_written());
333 }
334
335 bool
336 fs_inst::is_tex()
337 {
338 return (opcode == SHADER_OPCODE_TEX ||
339 opcode == FS_OPCODE_TXB ||
340 opcode == SHADER_OPCODE_TXD ||
341 opcode == SHADER_OPCODE_TXF ||
342 opcode == SHADER_OPCODE_TXF_MS ||
343 opcode == SHADER_OPCODE_TXL ||
344 opcode == SHADER_OPCODE_TXS ||
345 opcode == SHADER_OPCODE_LOD);
346 }
347
348 bool
349 fs_inst::is_math()
350 {
351 return (opcode == SHADER_OPCODE_RCP ||
352 opcode == SHADER_OPCODE_RSQ ||
353 opcode == SHADER_OPCODE_SQRT ||
354 opcode == SHADER_OPCODE_EXP2 ||
355 opcode == SHADER_OPCODE_LOG2 ||
356 opcode == SHADER_OPCODE_SIN ||
357 opcode == SHADER_OPCODE_COS ||
358 opcode == SHADER_OPCODE_INT_QUOTIENT ||
359 opcode == SHADER_OPCODE_INT_REMAINDER ||
360 opcode == SHADER_OPCODE_POW);
361 }
362
363 bool
364 fs_inst::is_control_flow()
365 {
366 switch (opcode) {
367 case BRW_OPCODE_DO:
368 case BRW_OPCODE_WHILE:
369 case BRW_OPCODE_IF:
370 case BRW_OPCODE_ELSE:
371 case BRW_OPCODE_ENDIF:
372 case BRW_OPCODE_BREAK:
373 case BRW_OPCODE_CONTINUE:
374 return true;
375 default:
376 return false;
377 }
378 }
379
380 bool
381 fs_inst::is_send_from_grf()
382 {
383 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
384 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
385 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
386 src[1].file == GRF));
387 }
388
389 bool
390 fs_visitor::can_do_source_mods(fs_inst *inst)
391 {
392 if (intel->gen == 6 && inst->is_math())
393 return false;
394
395 if (inst->is_send_from_grf())
396 return false;
397
398 return true;
399 }
400
401 void
402 fs_reg::init()
403 {
404 memset(this, 0, sizeof(*this));
405 this->smear = -1;
406 }
407
408 /** Generic unset register constructor. */
409 fs_reg::fs_reg()
410 {
411 init();
412 this->file = BAD_FILE;
413 }
414
415 /** Immediate value constructor. */
416 fs_reg::fs_reg(float f)
417 {
418 init();
419 this->file = IMM;
420 this->type = BRW_REGISTER_TYPE_F;
421 this->imm.f = f;
422 }
423
424 /** Immediate value constructor. */
425 fs_reg::fs_reg(int32_t i)
426 {
427 init();
428 this->file = IMM;
429 this->type = BRW_REGISTER_TYPE_D;
430 this->imm.i = i;
431 }
432
433 /** Immediate value constructor. */
434 fs_reg::fs_reg(uint32_t u)
435 {
436 init();
437 this->file = IMM;
438 this->type = BRW_REGISTER_TYPE_UD;
439 this->imm.u = u;
440 }
441
442 /** Fixed brw_reg Immediate value constructor. */
443 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
444 {
445 init();
446 this->file = FIXED_HW_REG;
447 this->fixed_hw_reg = fixed_hw_reg;
448 this->type = fixed_hw_reg.type;
449 }
450
451 bool
452 fs_reg::equals(const fs_reg &r) const
453 {
454 return (file == r.file &&
455 reg == r.reg &&
456 reg_offset == r.reg_offset &&
457 type == r.type &&
458 negate == r.negate &&
459 abs == r.abs &&
460 !reladdr && !r.reladdr &&
461 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
462 sizeof(fixed_hw_reg)) == 0 &&
463 smear == r.smear &&
464 imm.u == r.imm.u);
465 }
466
467 bool
468 fs_reg::is_zero() const
469 {
470 if (file != IMM)
471 return false;
472
473 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
474 }
475
476 bool
477 fs_reg::is_one() const
478 {
479 if (file != IMM)
480 return false;
481
482 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
483 }
484
485 int
486 fs_visitor::type_size(const struct glsl_type *type)
487 {
488 unsigned int size, i;
489
490 switch (type->base_type) {
491 case GLSL_TYPE_UINT:
492 case GLSL_TYPE_INT:
493 case GLSL_TYPE_FLOAT:
494 case GLSL_TYPE_BOOL:
495 return type->components();
496 case GLSL_TYPE_ARRAY:
497 return type_size(type->fields.array) * type->length;
498 case GLSL_TYPE_STRUCT:
499 size = 0;
500 for (i = 0; i < type->length; i++) {
501 size += type_size(type->fields.structure[i].type);
502 }
503 return size;
504 case GLSL_TYPE_SAMPLER:
505 /* Samplers take up no register space, since they're baked in at
506 * link time.
507 */
508 return 0;
509 case GLSL_TYPE_VOID:
510 case GLSL_TYPE_ERROR:
511 case GLSL_TYPE_INTERFACE:
512 assert(!"not reached");
513 break;
514 }
515
516 return 0;
517 }
518
519 fs_reg
520 fs_visitor::get_timestamp()
521 {
522 assert(intel->gen >= 7);
523
524 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
525 BRW_ARF_TIMESTAMP,
526 0),
527 BRW_REGISTER_TYPE_UD));
528
529 fs_reg dst = fs_reg(this, glsl_type::uint_type);
530
531 fs_inst *mov = emit(MOV(dst, ts));
532 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
533 * even if it's not enabled in the dispatch.
534 */
535 mov->force_writemask_all = true;
536 mov->force_uncompressed = true;
537
538 /* The caller wants the low 32 bits of the timestamp. Since it's running
539 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
540 * which is plenty of time for our purposes. It is identical across the
541 * EUs, but since it's tracking GPU core speed it will increment at a
542 * varying rate as render P-states change.
543 *
544 * The caller could also check if render P-states have changed (or anything
545 * else that might disrupt timing) by setting smear to 2 and checking if
546 * that field is != 0.
547 */
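   /* (emit_shader_time_end() below performs that reset check, reading the
    * field via smear == 2.)
    */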
548 dst.smear = 0;
549
550 return dst;
551 }
552
553 void
554 fs_visitor::emit_shader_time_begin()
555 {
556 current_annotation = "shader time start";
557 shader_start_time = get_timestamp();
558 }
559
560 void
561 fs_visitor::emit_shader_time_end()
562 {
563 current_annotation = "shader time end";
564
565 enum shader_time_shader_type type, written_type, reset_type;
566 if (dispatch_width == 8) {
567 type = ST_FS8;
568 written_type = ST_FS8_WRITTEN;
569 reset_type = ST_FS8_RESET;
570 } else {
571 assert(dispatch_width == 16);
572 type = ST_FS16;
573 written_type = ST_FS16_WRITTEN;
574 reset_type = ST_FS16_RESET;
575 }
576
577 fs_reg shader_end_time = get_timestamp();
578
579 /* Check that there weren't any timestamp reset events (assuming these
580 * were the only two timestamp reads that happened).
581 */
582 fs_reg reset = shader_end_time;
583 reset.smear = 2;
584 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
585 test->conditional_mod = BRW_CONDITIONAL_Z;
586 emit(IF(BRW_PREDICATE_NORMAL));
587
588 push_force_uncompressed();
589 fs_reg start = shader_start_time;
590 start.negate = true;
591 fs_reg diff = fs_reg(this, glsl_type::uint_type);
592 emit(ADD(diff, start, shader_end_time));
593
594 /* If there were no instructions between the two timestamp gets, the diff
595 * is 2 cycles. Remove that overhead, so I can forget about that when
596 * trying to determine the time taken for single instructions.
597 */
598 emit(ADD(diff, diff, fs_reg(-2u)));
599
600 emit_shader_time_write(type, diff);
601 emit_shader_time_write(written_type, fs_reg(1u));
602 emit(BRW_OPCODE_ELSE);
603 emit_shader_time_write(reset_type, fs_reg(1u));
604 emit(BRW_OPCODE_ENDIF);
605
606 pop_force_uncompressed();
607 }
608
609 void
610 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
611 fs_reg value)
612 {
613 int shader_time_index = brw_get_shader_time_index(brw, prog, &fp->Base,
614 type);
615 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
616
617 fs_reg payload;
618 if (dispatch_width == 8)
619 payload = fs_reg(this, glsl_type::uvec2_type);
620 else
621 payload = fs_reg(this, glsl_type::uint_type);
622
623 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
624 fs_reg(), payload, offset, value));
625 }
626
627 void
628 fs_visitor::fail(const char *format, ...)
629 {
630 va_list va;
631 char *msg;
632
633 if (failed)
634 return;
635
636 failed = true;
637
638 va_start(va, format);
639 msg = ralloc_vasprintf(mem_ctx, format, va);
640 va_end(va);
641 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
642
643 this->fail_msg = msg;
644
645 if (INTEL_DEBUG & DEBUG_WM) {
646 fprintf(stderr, "%s", msg);
647 }
648 }
649
650 fs_inst *
651 fs_visitor::emit(enum opcode opcode)
652 {
653 return emit(fs_inst(opcode));
654 }
655
656 fs_inst *
657 fs_visitor::emit(enum opcode opcode, fs_reg dst)
658 {
659 return emit(fs_inst(opcode, dst));
660 }
661
662 fs_inst *
663 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
664 {
665 return emit(fs_inst(opcode, dst, src0));
666 }
667
668 fs_inst *
669 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
670 {
671 return emit(fs_inst(opcode, dst, src0, src1));
672 }
673
674 fs_inst *
675 fs_visitor::emit(enum opcode opcode, fs_reg dst,
676 fs_reg src0, fs_reg src1, fs_reg src2)
677 {
678 return emit(fs_inst(opcode, dst, src0, src1, src2));
679 }
680
681 void
682 fs_visitor::push_force_uncompressed()
683 {
684 force_uncompressed_stack++;
685 }
686
687 void
688 fs_visitor::pop_force_uncompressed()
689 {
690 force_uncompressed_stack--;
691 assert(force_uncompressed_stack >= 0);
692 }
693
694 void
695 fs_visitor::push_force_sechalf()
696 {
697 force_sechalf_stack++;
698 }
699
700 void
701 fs_visitor::pop_force_sechalf()
702 {
703 force_sechalf_stack--;
704 assert(force_sechalf_stack >= 0);
705 }
706
707 /**
708 * Returns how many MRFs an FS opcode will write over.
709 *
710 * Note that this is not the 0 or 1 implied writes in an actual gen
711 * instruction -- the FS opcodes often generate MOVs in addition.
712 */
713 int
714 fs_visitor::implied_mrf_writes(fs_inst *inst)
715 {
716 if (inst->mlen == 0)
717 return 0;
718
719 switch (inst->opcode) {
720 case SHADER_OPCODE_RCP:
721 case SHADER_OPCODE_RSQ:
722 case SHADER_OPCODE_SQRT:
723 case SHADER_OPCODE_EXP2:
724 case SHADER_OPCODE_LOG2:
725 case SHADER_OPCODE_SIN:
726 case SHADER_OPCODE_COS:
727 return 1 * dispatch_width / 8;
728 case SHADER_OPCODE_POW:
729 case SHADER_OPCODE_INT_QUOTIENT:
730 case SHADER_OPCODE_INT_REMAINDER:
731 return 2 * dispatch_width / 8;
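   /* e.g., the two-operand math opcodes above send two operands of
    * dispatch_width / 8 MRFs each: 2 MRFs in SIMD8, 4 MRFs in SIMD16.
    */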
732 case SHADER_OPCODE_TEX:
733 case FS_OPCODE_TXB:
734 case SHADER_OPCODE_TXD:
735 case SHADER_OPCODE_TXF:
736 case SHADER_OPCODE_TXF_MS:
737 case SHADER_OPCODE_TXL:
738 case SHADER_OPCODE_TXS:
739 case SHADER_OPCODE_LOD:
740 return 1;
741 case FS_OPCODE_FB_WRITE:
742 return 2;
743 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
744 case FS_OPCODE_UNSPILL:
745 return 1;
746 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
747 return inst->header_present;
748 case FS_OPCODE_SPILL:
749 return 2;
750 default:
751 assert(!"not reached");
752 return inst->mlen;
753 }
754 }
755
756 int
757 fs_visitor::virtual_grf_alloc(int size)
758 {
759 if (virtual_grf_array_size <= virtual_grf_count) {
760 if (virtual_grf_array_size == 0)
761 virtual_grf_array_size = 16;
762 else
763 virtual_grf_array_size *= 2;
764 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
765 virtual_grf_array_size);
766 }
767 virtual_grf_sizes[virtual_grf_count] = size;
768 return virtual_grf_count++;
769 }
770
771 /** Fixed HW reg constructor. */
772 fs_reg::fs_reg(enum register_file file, int reg)
773 {
774 init();
775 this->file = file;
776 this->reg = reg;
777 this->type = BRW_REGISTER_TYPE_F;
778 }
779
780 /** Fixed HW reg constructor. */
781 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
782 {
783 init();
784 this->file = file;
785 this->reg = reg;
786 this->type = type;
787 }
788
789 /** Automatic reg constructor. */
790 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
791 {
792 init();
793
794 this->file = GRF;
795 this->reg = v->virtual_grf_alloc(v->type_size(type));
796 this->reg_offset = 0;
797 this->type = brw_type_for_base_type(type);
798 }
799
800 fs_reg *
801 fs_visitor::variable_storage(ir_variable *var)
802 {
803 return (fs_reg *)hash_table_find(this->variable_ht, var);
804 }
805
806 void
807 import_uniforms_callback(const void *key,
808 void *data,
809 void *closure)
810 {
811 struct hash_table *dst_ht = (struct hash_table *)closure;
812 const fs_reg *reg = (const fs_reg *)data;
813
814 if (reg->file != UNIFORM)
815 return;
816
817 hash_table_insert(dst_ht, data, key);
818 }
819
820 /* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
821 * This brings in those uniform definitions.
822 */
823 void
824 fs_visitor::import_uniforms(fs_visitor *v)
825 {
826 hash_table_call_foreach(v->variable_ht,
827 import_uniforms_callback,
828 variable_ht);
829 this->params_remap = v->params_remap;
830 }
831
832 /* Our support for uniforms is piggy-backed on the struct
833 * gl_fragment_program, because that's where the values actually
834 * get stored, rather than in some global gl_shader_program uniform
835 * store.
836 */
837 void
838 fs_visitor::setup_uniform_values(ir_variable *ir)
839 {
840 int namelen = strlen(ir->name);
841
842 /* The data for our (non-builtin) uniforms is stored in a series of
843 * gl_uniform_driver_storage structs for each subcomponent that
844 * glGetUniformLocation() could name. We know it's been set up in the same
845 * order we'd walk the type, so walk the list of storage and find anything
846 * with our name, or the prefix of a component that starts with our name.
847 */
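   /* Illustrative example (hypothetical uniform): for
    * "uniform struct { vec4 diffuse; float shininess; } mat;" the storage list
    * holds entries named "mat.diffuse" and "mat.shininess", both of which
    * match the prefix test below.
    */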
848 unsigned params_before = c->prog_data.nr_params;
849 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
850 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
851
852 if (strncmp(ir->name, storage->name, namelen) != 0 ||
853 (storage->name[namelen] != 0 &&
854 storage->name[namelen] != '.' &&
855 storage->name[namelen] != '[')) {
856 continue;
857 }
858
859 unsigned slots = storage->type->component_slots();
860 if (storage->array_elements)
861 slots *= storage->array_elements;
862
863 for (unsigned i = 0; i < slots; i++) {
864 c->prog_data.param[c->prog_data.nr_params++] =
865 &storage->storage[i].f;
866 }
867 }
868
869 /* Make sure we actually initialized the right amount of stuff here. */
870 assert(params_before + ir->type->component_slots() ==
871 c->prog_data.nr_params);
872 }
873
874
875 /* Our support for builtin uniforms is even scarier than non-builtin.
876 * It sits on top of the PROG_STATE_VAR parameters that are
877 * automatically updated from GL context state.
878 */
879 void
880 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
881 {
882 const ir_state_slot *const slots = ir->state_slots;
883 assert(ir->state_slots != NULL);
884
885 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
886 /* This state reference has already been setup by ir_to_mesa, but we'll
887 * get the same index back here.
888 */
889 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
890 (gl_state_index *)slots[i].tokens);
891
892 /* Add each of the unique swizzles of the element as a parameter.
893 * This'll end up matching the expected layout of the
894 * array/matrix/structure we're trying to fill in.
895 */
896 int last_swiz = -1;
897 for (unsigned int j = 0; j < 4; j++) {
898 int swiz = GET_SWZ(slots[i].swizzle, j);
899 if (swiz == last_swiz)
900 break;
901 last_swiz = swiz;
902
903 c->prog_data.param[c->prog_data.nr_params++] =
904 &fp->Base.Parameters->ParameterValues[index][swiz].f;
905 }
906 }
907 }
908
909 fs_reg *
910 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
911 {
912 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
913 fs_reg wpos = *reg;
914 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
915
916 /* gl_FragCoord.x */
917 if (ir->pixel_center_integer) {
918 emit(MOV(wpos, this->pixel_x));
919 } else {
920 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
921 }
922 wpos.reg_offset++;
923
924 /* gl_FragCoord.y */
925 if (!flip && ir->pixel_center_integer) {
926 emit(MOV(wpos, this->pixel_y));
927 } else {
928 fs_reg pixel_y = this->pixel_y;
929 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
930
931 if (flip) {
932 pixel_y.negate = true;
933 offset += c->key.drawable_height - 1.0;
934 }
935
936 emit(ADD(wpos, pixel_y, fs_reg(offset)));
937 }
938 wpos.reg_offset++;
939
940 /* gl_FragCoord.z */
941 if (intel->gen >= 6) {
942 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
943 } else {
944 emit(FS_OPCODE_LINTERP, wpos,
945 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
946 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
947 interp_reg(VARYING_SLOT_POS, 2));
948 }
949 wpos.reg_offset++;
950
951 /* gl_FragCoord.w: Already set up in emit_interpolation */
952 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
953
954 return reg;
955 }
956
957 fs_inst *
958 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
959 glsl_interp_qualifier interpolation_mode,
960 bool is_centroid)
961 {
962 brw_wm_barycentric_interp_mode barycoord_mode;
963 if (is_centroid) {
964 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
965 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
966 else
967 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
968 } else {
969 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
970 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
971 else
972 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
973 }
974 return emit(FS_OPCODE_LINTERP, attr,
975 this->delta_x[barycoord_mode],
976 this->delta_y[barycoord_mode], interp);
977 }
978
979 fs_reg *
980 fs_visitor::emit_general_interpolation(ir_variable *ir)
981 {
982 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
983 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
984 fs_reg attr = *reg;
985
986 unsigned int array_elements;
987 const glsl_type *type;
988
989 if (ir->type->is_array()) {
990 array_elements = ir->type->length;
991 if (array_elements == 0) {
992 fail("dereferenced array '%s' has length 0\n", ir->name);
993 }
994 type = ir->type->fields.array;
995 } else {
996 array_elements = 1;
997 type = ir->type;
998 }
999
1000 glsl_interp_qualifier interpolation_mode =
1001 ir->determine_interpolation_mode(c->key.flat_shade);
1002
1003 int location = ir->location;
1004 for (unsigned int i = 0; i < array_elements; i++) {
1005 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1006 if (urb_setup[location] == -1) {
1007 /* If there's no incoming setup data for this slot, don't
1008 * emit interpolation for it.
1009 */
1010 attr.reg_offset += type->vector_elements;
1011 location++;
1012 continue;
1013 }
1014
1015 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1016 /* Constant interpolation (flat shading) case. The SF has
1017 * handed us defined values in only the constant offset
1018 * field of the setup reg.
1019 */
1020 for (unsigned int k = 0; k < type->vector_elements; k++) {
1021 struct brw_reg interp = interp_reg(location, k);
1022 interp = suboffset(interp, 3);
1023 interp.type = reg->type;
1024 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1025 attr.reg_offset++;
1026 }
1027 } else {
1028 /* Smooth/noperspective interpolation case. */
1029 for (unsigned int k = 0; k < type->vector_elements; k++) {
1030 /* FINISHME: At some point we probably want to push
1031 * this farther by giving similar treatment to the
1032 * other potentially constant components of the
1033 * attribute, as well as making brw_vs_constval.c
1034 * handle varyings other than gl_TexCoord.
1035 */
1036 if (location >= VARYING_SLOT_TEX0 &&
1037 location <= VARYING_SLOT_TEX7 &&
1038 k == 3 && !(c->key.proj_attrib_mask
1039 & BITFIELD64_BIT(location))) {
1040 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
1041 } else {
1042 struct brw_reg interp = interp_reg(location, k);
1043 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1044 ir->centroid);
1045 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1046 /* Get the pixel/sample mask into f0 so that we know
1047 * which pixels are lit. Then, for each channel that is
1048 * unlit, replace the centroid data with non-centroid
1049 * data.
1050 */
1051 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
1052 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1053 interpolation_mode, false);
1054 inst->predicate = BRW_PREDICATE_NORMAL;
1055 inst->predicate_inverse = true;
1056 }
1057 if (intel->gen < 6) {
1058 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1059 }
1060 }
1061 attr.reg_offset++;
1062 }
1063
1064 }
1065 location++;
1066 }
1067 }
1068
1069 return reg;
1070 }
1071
1072 fs_reg *
1073 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1074 {
1075 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1076
1077 /* The frontfacing comes in as a bit in the thread payload. */
1078 if (intel->gen >= 6) {
1079 emit(BRW_OPCODE_ASR, *reg,
1080 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1081 fs_reg(15));
1082 emit(BRW_OPCODE_NOT, *reg, *reg);
1083 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1084 } else {
1085 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1086 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1087 * us front face
1088 */
1089 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1090 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1091 }
1092
1093 return reg;
1094 }
1095
1096 fs_reg
1097 fs_visitor::fix_math_operand(fs_reg src)
1098 {
1099 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1100 * might be able to do better by doing execsize = 1 math and then
1101 * expanding that result out, but we would need to be careful with
1102 * masking.
1103 *
1104 * The hardware ignores source modifiers (negate and abs) on math
1105 * instructions, so we also move to a temp to set those up.
1106 */
1107 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1108 !src.abs && !src.negate)
1109 return src;
1110
1111 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1112 * operands to math
1113 */
1114 if (intel->gen >= 7 && src.file != IMM)
1115 return src;
1116
1117 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1118 expanded.type = src.type;
1119 emit(BRW_OPCODE_MOV, expanded, src);
1120 return expanded;
1121 }
1122
1123 fs_inst *
1124 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1125 {
1126 switch (opcode) {
1127 case SHADER_OPCODE_RCP:
1128 case SHADER_OPCODE_RSQ:
1129 case SHADER_OPCODE_SQRT:
1130 case SHADER_OPCODE_EXP2:
1131 case SHADER_OPCODE_LOG2:
1132 case SHADER_OPCODE_SIN:
1133 case SHADER_OPCODE_COS:
1134 break;
1135 default:
1136 assert(!"not reached: bad math opcode");
1137 return NULL;
1138 }
1139
1140 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1141 * might be able to do better by doing execsize = 1 math and then
1142 * expanding that result out, but we would need to be careful with
1143 * masking.
1144 *
1145 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1146 * instructions, so we also move to a temp to set those up.
1147 */
1148 if (intel->gen >= 6)
1149 src = fix_math_operand(src);
1150
1151 fs_inst *inst = emit(opcode, dst, src);
1152
1153 if (intel->gen < 6) {
1154 inst->base_mrf = 2;
1155 inst->mlen = dispatch_width / 8;
1156 }
1157
1158 return inst;
1159 }
1160
1161 fs_inst *
1162 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1163 {
1164 int base_mrf = 2;
1165 fs_inst *inst;
1166
1167 switch (opcode) {
1168 case SHADER_OPCODE_INT_QUOTIENT:
1169 case SHADER_OPCODE_INT_REMAINDER:
1170 if (intel->gen >= 7 && dispatch_width == 16)
1171 fail("16-wide INTDIV unsupported\n");
1172 break;
1173 case SHADER_OPCODE_POW:
1174 break;
1175 default:
1176 assert(!"not reached: unsupported binary math opcode.");
1177 return NULL;
1178 }
1179
1180 if (intel->gen >= 6) {
1181 src0 = fix_math_operand(src0);
1182 src1 = fix_math_operand(src1);
1183
1184 inst = emit(opcode, dst, src0, src1);
1185 } else {
1186 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1187 * "Message Payload":
1188 *
1189 * "Operand0[7]. For the INT DIV functions, this operand is the
1190 * denominator."
1191 * ...
1192 * "Operand1[7]. For the INT DIV functions, this operand is the
1193 * numerator."
1194 */
1195 bool is_int_div = opcode != SHADER_OPCODE_POW;
1196 fs_reg &op0 = is_int_div ? src1 : src0;
1197 fs_reg &op1 = is_int_div ? src0 : src1;
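      /* i.e., for INT DIV the numerator (src0) lands in the second message
       * register as operand 1, and the denominator (src1) becomes the
       * instruction's source (operand 0); POW keeps src0/src1 in order.
       */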
1198
1199 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1200 inst = emit(opcode, dst, op0, reg_null_f);
1201
1202 inst->base_mrf = base_mrf;
1203 inst->mlen = 2 * dispatch_width / 8;
1204 }
1205 return inst;
1206 }
1207
1208 void
1209 fs_visitor::assign_curb_setup()
1210 {
1211 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1212 if (dispatch_width == 8) {
1213 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1214 } else {
1215 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1216 }
1217
1218 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1219 foreach_list(node, &this->instructions) {
1220 fs_inst *inst = (fs_inst *)node;
1221
1222 for (unsigned int i = 0; i < 3; i++) {
1223 if (inst->src[i].file == UNIFORM) {
1224 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1225 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1226 constant_nr / 8,
1227 constant_nr % 8);
1228
1229 inst->src[i].file = FIXED_HW_REG;
1230 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1231 }
1232 }
1233 }
1234 }
1235
1236 void
1237 fs_visitor::calculate_urb_setup()
1238 {
1239 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1240 urb_setup[i] = -1;
1241 }
1242
1243 int urb_next = 0;
1244 /* Figure out where each of the incoming setup attributes lands. */
1245 if (intel->gen >= 6) {
1246 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1247 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1248 urb_setup[i] = urb_next++;
1249 }
1250 }
1251 } else {
1252 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1253 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1254 /* Point size is packed into the header, not as a general attribute */
1255 if (i == VARYING_SLOT_PSIZ)
1256 continue;
1257
1258 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1259 /* The back color slot is skipped when the front color is
1260 * also written to. In addition, some slots can be
1261 * written in the vertex shader and not read in the
1262 * fragment shader. So the register number must always be
1263 * incremented, mapped or not.
1264 */
1265 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1266 urb_setup[i] = urb_next;
1267 urb_next++;
1268 }
1269 }
1270
1271 /*
1272 * It's an FS-only attribute, and we did interpolation for this attribute
1273 * in the SF thread. So, count it here, too.
1274 *
1275 * See compile_sf_prog() for more info.
1276 */
1277 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1278 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1279 }
1280
1281 /* Each attribute is 4 setup channels, each of which is half a reg. */
1282 c->prog_data.urb_read_length = urb_next * 2;
1283 }
1284
1285 void
1286 fs_visitor::assign_urb_setup()
1287 {
1288 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1289
1290 /* Offset all the urb_setup[] index by the actual position of the
1291 * setup regs, now that the location of the constants has been chosen.
1292 */
1293 foreach_list(node, &this->instructions) {
1294 fs_inst *inst = (fs_inst *)node;
1295
1296 if (inst->opcode == FS_OPCODE_LINTERP) {
1297 assert(inst->src[2].file == FIXED_HW_REG);
1298 inst->src[2].fixed_hw_reg.nr += urb_start;
1299 }
1300
1301 if (inst->opcode == FS_OPCODE_CINTERP) {
1302 assert(inst->src[0].file == FIXED_HW_REG);
1303 inst->src[0].fixed_hw_reg.nr += urb_start;
1304 }
1305 }
1306
1307 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1308 }
1309
1310 /**
1311 * Split large virtual GRFs into separate components if we can.
1312 *
1313 * This is mostly duplicated with what brw_fs_vector_splitting does,
1314 * but that's really conservative because it's afraid of doing
1315 * splitting that doesn't result in real progress after the rest of
1316 * the optimization phases, which would cause infinite looping in
1317 * optimization. We can do it once here, safely. This also has the
1318 * opportunity to split interpolated values, or maybe even uniforms,
1319 * which we don't have at the IR level.
1320 *
1321 * We want to split, because virtual GRFs are what we register
1322 * allocate and spill (due to contiguousness requirements for some
1323 * instructions), and they're what we naturally generate in the
1324 * codegen process, but most virtual GRFs don't actually need to be
1325 * contiguous sets of GRFs. If we split, we'll end up with reduced
1326 * live intervals and better dead code elimination and coalescing.
1327 */
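/* For example, a virtual GRF holding a vec4 temporary (4 contiguous registers)
 * can usually be split into four independent 1-register GRFs, each with its
 * own live interval.
 */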
1328 void
1329 fs_visitor::split_virtual_grfs()
1330 {
1331 int num_vars = this->virtual_grf_count;
1332 bool split_grf[num_vars];
1333 int new_virtual_grf[num_vars];
1334
1335 /* Try to split anything > 0 sized. */
1336 for (int i = 0; i < num_vars; i++) {
1337 if (this->virtual_grf_sizes[i] != 1)
1338 split_grf[i] = true;
1339 else
1340 split_grf[i] = false;
1341 }
1342
1343 if (brw->has_pln &&
1344 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1345 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1346 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1347 * Gen6, that was the only supported interpolation mode, and since Gen6,
1348 * delta_x and delta_y are in fixed hardware registers.
1349 */
1350 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1351 false;
1352 }
1353
1354 foreach_list(node, &this->instructions) {
1355 fs_inst *inst = (fs_inst *)node;
1356
1357 /* If there's a SEND message that requires contiguous destination
1358 * registers, no splitting is allowed.
1359 */
1360 if (inst->regs_written() > 1) {
1361 split_grf[inst->dst.reg] = false;
1362 }
1363
1364 /* If we're sending from a GRF, don't split it, on the assumption that
1365 * the send is reading the whole thing.
1366 */
1367 if (inst->is_send_from_grf()) {
1368 split_grf[inst->src[0].reg] = false;
1369 }
1370 }
1371
1372 /* Allocate new space for split regs. Note that the virtual
1373 * numbers will be contiguous.
1374 */
1375 for (int i = 0; i < num_vars; i++) {
1376 if (split_grf[i]) {
1377 new_virtual_grf[i] = virtual_grf_alloc(1);
1378 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1379 int reg = virtual_grf_alloc(1);
1380 assert(reg == new_virtual_grf[i] + j - 1);
1381 (void) reg;
1382 }
1383 this->virtual_grf_sizes[i] = 1;
1384 }
1385 }
1386
1387 foreach_list(node, &this->instructions) {
1388 fs_inst *inst = (fs_inst *)node;
1389
1390 if (inst->dst.file == GRF &&
1391 split_grf[inst->dst.reg] &&
1392 inst->dst.reg_offset != 0) {
1393 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1394 inst->dst.reg_offset - 1);
1395 inst->dst.reg_offset = 0;
1396 }
1397 for (int i = 0; i < 3; i++) {
1398 if (inst->src[i].file == GRF &&
1399 split_grf[inst->src[i].reg] &&
1400 inst->src[i].reg_offset != 0) {
1401 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1402 inst->src[i].reg_offset - 1);
1403 inst->src[i].reg_offset = 0;
1404 }
1405 }
1406 }
1407 this->live_intervals_valid = false;
1408 }
1409
1410 /**
1411 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1412 *
1413 * During code generation, we create tons of temporary variables, many of
1414 * which get immediately killed and are never used again. Yet, in later
1415 * optimization and analysis passes, such as compute_live_intervals, we need
1416 * to loop over all the virtual GRFs. Compacting them can save a lot of
1417 * overhead.
1418 */
1419 void
1420 fs_visitor::compact_virtual_grfs()
1421 {
1422 /* Mark which virtual GRFs are used, and count how many. */
1423 int remap_table[this->virtual_grf_count];
1424 memset(remap_table, -1, sizeof(remap_table));
1425
1426 foreach_list(node, &this->instructions) {
1427 const fs_inst *inst = (const fs_inst *) node;
1428
1429 if (inst->dst.file == GRF)
1430 remap_table[inst->dst.reg] = 0;
1431
1432 for (int i = 0; i < 3; i++) {
1433 if (inst->src[i].file == GRF)
1434 remap_table[inst->src[i].reg] = 0;
1435 }
1436 }
1437
1438 /* In addition to registers used in instructions, fs_visitor keeps
1439 * direct references to certain special values which must be patched:
1440 */
1441 fs_reg *special[] = {
1442 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1443 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1444 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1445 &delta_x[0], &delta_x[1], &delta_x[2],
1446 &delta_x[3], &delta_x[4], &delta_x[5],
1447 &delta_y[0], &delta_y[1], &delta_y[2],
1448 &delta_y[3], &delta_y[4], &delta_y[5],
1449 };
1450 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1451 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1452
1453 /* Treat all special values as used, to be conservative */
1454 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1455 if (special[i]->file == GRF)
1456 remap_table[special[i]->reg] = 0;
1457 }
1458
1459 /* Compact the GRF arrays. */
1460 int new_index = 0;
1461 for (int i = 0; i < this->virtual_grf_count; i++) {
1462 if (remap_table[i] != -1) {
1463 remap_table[i] = new_index;
1464 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1465 if (live_intervals_valid) {
1466 virtual_grf_use[new_index] = virtual_grf_use[i];
1467 virtual_grf_def[new_index] = virtual_grf_def[i];
1468 }
1469 ++new_index;
1470 }
1471 }
1472
1473 this->virtual_grf_count = new_index;
1474
1475 /* Patch all the instructions to use the newly renumbered registers */
1476 foreach_list(node, &this->instructions) {
1477 fs_inst *inst = (fs_inst *) node;
1478
1479 if (inst->dst.file == GRF)
1480 inst->dst.reg = remap_table[inst->dst.reg];
1481
1482 for (int i = 0; i < 3; i++) {
1483 if (inst->src[i].file == GRF)
1484 inst->src[i].reg = remap_table[inst->src[i].reg];
1485 }
1486 }
1487
1488 /* Patch all the references to special values */
1489 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1490 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1491 special[i]->reg = remap_table[special[i]->reg];
1492 }
1493 }
1494
1495 bool
1496 fs_visitor::remove_dead_constants()
1497 {
1498 if (dispatch_width == 8) {
1499 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1500
1501 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1502 this->params_remap[i] = -1;
1503
1504 /* Find which params are still in use. */
1505 foreach_list(node, &this->instructions) {
1506 fs_inst *inst = (fs_inst *)node;
1507
1508 for (int i = 0; i < 3; i++) {
1509 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1510
1511 if (inst->src[i].file != UNIFORM)
1512 continue;
1513
1514 assert(constant_nr < (int)c->prog_data.nr_params);
1515
1516 /* For now, set this to non-negative. We'll give it the
1517 * actual new number in a moment, in order to keep the
1518 * register numbers nicely ordered.
1519 */
1520 this->params_remap[constant_nr] = 0;
1521 }
1522 }
1523
1524 /* Figure out what the new numbers for the params will be. At some
1525 * point when we're doing uniform array access, we're going to want
1526 * to keep the distinction between .reg and .reg_offset, but for
1527 * now we don't care.
1528 */
1529 unsigned int new_nr_params = 0;
1530 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1531 if (this->params_remap[i] != -1) {
1532 this->params_remap[i] = new_nr_params++;
1533 }
1534 }
1535
1536 /* Update the list of params to be uploaded to match our new numbering. */
1537 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1538 int remapped = this->params_remap[i];
1539
1540 if (remapped == -1)
1541 continue;
1542
1543 c->prog_data.param[remapped] = c->prog_data.param[i];
1544 }
1545
1546 c->prog_data.nr_params = new_nr_params;
1547 } else {
1548 /* This should have been generated in the 8-wide pass already. */
1549 assert(this->params_remap);
1550 }
1551
1552 /* Now do the renumbering of the shader to remove unused params. */
1553 foreach_list(node, &this->instructions) {
1554 fs_inst *inst = (fs_inst *)node;
1555
1556 for (int i = 0; i < 3; i++) {
1557 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1558
1559 if (inst->src[i].file != UNIFORM)
1560 continue;
1561
1562 assert(this->params_remap[constant_nr] != -1);
1563 inst->src[i].reg = this->params_remap[constant_nr];
1564 inst->src[i].reg_offset = 0;
1565 }
1566 }
1567
1568 return true;
1569 }
1570
1571 /*
1572 * Implements array access of uniforms by inserting a
1573 * PULL_CONSTANT_LOAD instruction.
1574 *
1575 * Unlike temporary GRF array access (where we don't support it due to
1576 * the difficulty of doing relative addressing on instruction
1577 * destinations), we could potentially do array access of uniforms
1578 * that were loaded in GRF space as push constants. In real-world
1579 * usage we've seen, though, the arrays being used are always larger
1580 * than we could load as push constants, so just always move all
1581 * uniform array access out to a pull constant buffer.
1582 * uniform array access out to a pull constant buffer.  */
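/* Illustrative example (hypothetical shader): for "uniform vec4 colors[32];"
 * accessed as "colors[i]", the whole array is appended to pull_param and each
 * reladdr access below is replaced with a VARYING_PULL_CONSTANT_LOAD indexed
 * by i.
 */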
1583 void
1584 fs_visitor::move_uniform_array_access_to_pull_constants()
1585 {
1586 int pull_constant_loc[c->prog_data.nr_params];
1587
1588 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1589 pull_constant_loc[i] = -1;
1590 }
1591
1592 /* Walk through and find array access of uniforms. Put a copy of that
1593 * uniform in the pull constant buffer.
1594 *
1595 * Note that we don't move constant-indexed accesses to arrays. No
1596 * testing has been done of the performance impact of this choice.
1597 */
1598 foreach_list_safe(node, &this->instructions) {
1599 fs_inst *inst = (fs_inst *)node;
1600
1601 for (int i = 0 ; i < 3; i++) {
1602 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1603 continue;
1604
1605 int uniform = inst->src[i].reg;
1606
1607 /* If this array isn't already present in the pull constant buffer,
1608 * add it.
1609 */
1610 if (pull_constant_loc[uniform] == -1) {
1611 const float **values = &c->prog_data.param[uniform];
1612
1613 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1614
1615 assert(param_size[uniform]);
1616
1617 for (int j = 0; j < param_size[uniform]; j++) {
1618 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1619 values[j];
1620 }
1621 }
1622
1623 /* Set up the annotation tracking for new generated instructions. */
1624 base_ir = inst->ir;
1625 current_annotation = inst->annotation;
1626
1627 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1628 fs_reg temp = fs_reg(this, glsl_type::float_type);
1629 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1630 surf_index,
1631 *inst->src[i].reladdr,
1632 pull_constant_loc[uniform] +
1633 inst->src[i].reg_offset);
1634 inst->insert_before(&list);
1635
1636 inst->src[i].file = temp.file;
1637 inst->src[i].reg = temp.reg;
1638 inst->src[i].reg_offset = temp.reg_offset;
1639 inst->src[i].reladdr = NULL;
1640 }
1641 }
1642 }
1643
1644 /**
1645 * Choose accesses from the UNIFORM file to demote to using the pull
1646 * constant buffer.
1647 *
1648 * We allow a fragment shader to use more than the GL-specified minimum
1649 * for the maximum number of fragment shader uniform components (64). If
1650 * there are too many of them, they'd fill up all of register space.
1651 * So, this will push some of them out to the pull constant buffer and
1652 * update the program to load them.
1653 */
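/* For example (illustrative count): with 200 float uniform components, the
 * first 128 stay as push constants and the remaining 72 are appended to
 * pull_param; each demoted access below becomes a UNIFORM_PULL_CONSTANT_LOAD
 * whose smear (pull_index & 3) selects the dword within the fetched block.
 */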
1654 void
1655 fs_visitor::setup_pull_constants()
1656 {
1657 /* Only allow 16 registers (128 uniform components) as push constants. */
1658 unsigned int max_uniform_components = 16 * 8;
1659 if (c->prog_data.nr_params <= max_uniform_components)
1660 return;
1661
1662 if (dispatch_width == 16) {
1663 fail("Pull constants not supported in 16-wide\n");
1664 return;
1665 }
1666
1667 /* Just demote the end of the list. We could probably do better
1668 * here, demoting things that are rarely used in the program first.
1669 */
1670 unsigned int pull_uniform_base = max_uniform_components;
1671
1672 int pull_constant_loc[c->prog_data.nr_params];
1673 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1674 if (i < pull_uniform_base) {
1675 pull_constant_loc[i] = -1;
1676 } else {
1677 pull_constant_loc[i] = -1;
1678 /* If our constant is already being uploaded for reladdr purposes,
1679 * reuse it.
1680 */
1681 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1682 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1683 pull_constant_loc[i] = j;
1684 break;
1685 }
1686 }
1687 if (pull_constant_loc[i] == -1) {
1688 int pull_index = c->prog_data.nr_pull_params++;
1689 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1690 pull_constant_loc[i] = pull_index;
1691 }
1692 }
1693 }
1694 c->prog_data.nr_params = pull_uniform_base;
1695
1696 foreach_list(node, &this->instructions) {
1697 fs_inst *inst = (fs_inst *)node;
1698
1699 for (int i = 0; i < 3; i++) {
1700 if (inst->src[i].file != UNIFORM)
1701 continue;
1702
1703 int pull_index = pull_constant_loc[inst->src[i].reg +
1704 inst->src[i].reg_offset];
1705 if (pull_index == -1)
1706 continue;
1707
1708 assert(!inst->src[i].reladdr);
1709
1710 fs_reg dst = fs_reg(this, glsl_type::float_type);
1711 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1712 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1713 fs_inst *pull =
1714 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1715 dst, index, offset);
1716 pull->ir = inst->ir;
1717 pull->annotation = inst->annotation;
1718
1719 inst->insert_before(pull);
1720
1721 inst->src[i].file = GRF;
1722 inst->src[i].reg = dst.reg;
1723 inst->src[i].reg_offset = 0;
1724 inst->src[i].smear = pull_index & 3;
1725 }
1726 }
1727 }
1728
1729 bool
1730 fs_visitor::opt_algebraic()
1731 {
1732 bool progress = false;
1733
1734 foreach_list(node, &this->instructions) {
1735 fs_inst *inst = (fs_inst *)node;
1736
1737 switch (inst->opcode) {
1738 case BRW_OPCODE_MUL:
1739 if (inst->src[1].file != IMM)
1740 continue;
1741
1742 /* a * 1.0 = a */
1743 if (inst->src[1].is_one()) {
1744 inst->opcode = BRW_OPCODE_MOV;
1745 inst->src[1] = reg_undef;
1746 progress = true;
1747 break;
1748 }
1749
1750 /* a * 0.0 = 0.0 */
1751 if (inst->src[1].is_zero()) {
1752 inst->opcode = BRW_OPCODE_MOV;
1753 inst->src[0] = inst->src[1];
1754 inst->src[1] = reg_undef;
1755 progress = true;
1756 break;
1757 }
1758
1759 break;
1760 case BRW_OPCODE_ADD:
1761 if (inst->src[1].file != IMM)
1762 continue;
1763
1764 /* a + 0.0 = a */
1765 if (inst->src[1].is_zero()) {
1766 inst->opcode = BRW_OPCODE_MOV;
1767 inst->src[1] = reg_undef;
1768 progress = true;
1769 break;
1770 }
1771 break;
1772 default:
1773 break;
1774 }
1775 }
1776
1777 return progress;
1778 }
1779
1780 /**
1781 * Must be called after calculate_live_intervals() to remove unused
1782 * writes to registers -- register allocation will fail otherwise
1783 * because something def'd but not used won't be considered to
1784 * interfere with other regs.
1785 */
1786 bool
1787 fs_visitor::dead_code_eliminate()
1788 {
1789 bool progress = false;
1790 int pc = 0;
1791
1792 calculate_live_intervals();
1793
1794 foreach_list_safe(node, &this->instructions) {
1795 fs_inst *inst = (fs_inst *)node;
1796
1797 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1798 inst->remove();
1799 progress = true;
1800 }
1801
1802 pc++;
1803 }
1804
1805 if (progress)
1806 live_intervals_valid = false;
1807
1808 return progress;
1809 }
1810
1811 /**
1812 * Implements a second type of register coalescing: This one checks if
1813 * the two regs involved in a raw move don't interfere, in which case
1814 * they can both by stored in the same place and the MOV removed.
1815 * they can both be stored in the same place and the MOV removed.
1816 bool
1817 fs_visitor::register_coalesce_2()
1818 {
1819 bool progress = false;
1820
1821 calculate_live_intervals();
1822
1823 foreach_list_safe(node, &this->instructions) {
1824 fs_inst *inst = (fs_inst *)node;
1825
1826 if (inst->opcode != BRW_OPCODE_MOV ||
1827 inst->predicate ||
1828 inst->saturate ||
1829 inst->src[0].file != GRF ||
1830 inst->src[0].negate ||
1831 inst->src[0].abs ||
1832 inst->src[0].smear != -1 ||
1833 inst->dst.file != GRF ||
1834 inst->dst.type != inst->src[0].type ||
1835 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1836 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1837 continue;
1838 }
1839
1840 int reg_from = inst->src[0].reg;
1841 assert(inst->src[0].reg_offset == 0);
1842 int reg_to = inst->dst.reg;
1843 int reg_to_offset = inst->dst.reg_offset;
1844
1845 foreach_list(node, &this->instructions) {
1846 fs_inst *scan_inst = (fs_inst *)node;
1847
1848 if (scan_inst->dst.file == GRF &&
1849 scan_inst->dst.reg == reg_from) {
1850 scan_inst->dst.reg = reg_to;
1851 scan_inst->dst.reg_offset = reg_to_offset;
1852 }
1853 for (int i = 0; i < 3; i++) {
1854 if (scan_inst->src[i].file == GRF &&
1855 scan_inst->src[i].reg == reg_from) {
1856 scan_inst->src[i].reg = reg_to;
1857 scan_inst->src[i].reg_offset = reg_to_offset;
1858 }
1859 }
1860 }
1861
1862 inst->remove();
1863
1864 /* We don't need to recalculate live intervals inside the loop despite
1865 * flagging live_intervals_valid because we only use live intervals for
1866 * the interferes test, and we must have had a situation where the
1867 * intervals were:
1868 *
1869 *    from   to
1870 *     ^
1871 *     |
1872 *     v
1873 *            ^
1874 *            |
1875 *            v
1876 *
1877 * Some register R that might get coalesced with one of these two could
1878 * only be referencing "to", otherwise "from"'s range would have been
1879 * longer. R's range could also only start at the end of "to" or later,
1880 * otherwise it will conflict with "to" when we try to coalesce "to"
1881 * into R anyway.
1882 */
1883 live_intervals_valid = false;
1884
1885 progress = true;
1886 continue;
1887 }
1888
1889 return progress;
1890 }
1891
1892 bool
1893 fs_visitor::register_coalesce()
1894 {
1895 bool progress = false;
1896 int if_depth = 0;
1897 int loop_depth = 0;
1898
1899 foreach_list_safe(node, &this->instructions) {
1900 fs_inst *inst = (fs_inst *)node;
1901
1902 /* Make sure that we dominate the instructions we're going to
1903 * scan for interfering with our coalescing, or we won't have
1904 * scanned enough to see if anything interferes with our
1905 * coalescing. We don't dominate the following instructions if
1906 * we're in a loop or an if block.
1907 */
1908 switch (inst->opcode) {
1909 case BRW_OPCODE_DO:
1910 loop_depth++;
1911 break;
1912 case BRW_OPCODE_WHILE:
1913 loop_depth--;
1914 break;
1915 case BRW_OPCODE_IF:
1916 if_depth++;
1917 break;
1918 case BRW_OPCODE_ENDIF:
1919 if_depth--;
1920 break;
1921 default:
1922 break;
1923 }
1924 if (loop_depth || if_depth)
1925 continue;
1926
1927 if (inst->opcode != BRW_OPCODE_MOV ||
1928 inst->predicate ||
1929 inst->saturate ||
1930 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1931 inst->src[0].file != UNIFORM) ||
1932 inst->dst.type != inst->src[0].type)
1933 continue;
1934
1935 bool has_source_modifiers = (inst->src[0].abs ||
1936 inst->src[0].negate ||
1937 inst->src[0].smear != -1 ||
1938 inst->src[0].file == UNIFORM);
1939
1940 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1941 * them: check for no writes to either one until the exit of the
1942 * program.
1943 */
1944 bool interfered = false;
1945
1946 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1947 !scan_inst->is_tail_sentinel();
1948 scan_inst = (fs_inst *)scan_inst->next) {
1949 if (scan_inst->dst.file == GRF) {
1950 if (scan_inst->overwrites_reg(inst->dst) ||
1951 scan_inst->overwrites_reg(inst->src[0])) {
1952 interfered = true;
1953 break;
1954 }
1955 }
1956
1957 /* The gen6 MATH instruction can't handle source modifiers or
1958 * unusual register regions, so avoid coalescing those for
1959 * now. We should do something more specific.
1960 */
1961 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1962 interfered = true;
1963 break;
1964 }
1965
1966 /* The accumulator result appears to get used for the
1967 * conditional modifier generation. When negating a UD
1968 * value, there is a 33rd bit generated for the sign in the
1969 * accumulator value, so now you can't check, for example,
1970 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1971 */
1972 if (scan_inst->conditional_mod &&
1973 inst->src[0].negate &&
1974 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1975 interfered = true;
1976 break;
1977 }
1978 }
1979 if (interfered) {
1980 continue;
1981 }
1982
1983 /* Rewrite the later usage to point at the source of the move to
1984 * be removed.
1985 */
1986 for (fs_inst *scan_inst = inst;
1987 !scan_inst->is_tail_sentinel();
1988 scan_inst = (fs_inst *)scan_inst->next) {
1989 for (int i = 0; i < 3; i++) {
1990 if (scan_inst->src[i].file == GRF &&
1991 scan_inst->src[i].reg == inst->dst.reg &&
1992 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1993 fs_reg new_src = inst->src[0];
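               /* Fold the modifiers together: abs on the use discards any
                * negate carried by the moved value, and the use's own negate
                * is then applied on top of whatever remains.
                */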
1994 if (scan_inst->src[i].abs) {
1995 new_src.negate = 0;
1996 new_src.abs = 1;
1997 }
1998 new_src.negate ^= scan_inst->src[i].negate;
1999 scan_inst->src[i] = new_src;
2000 }
2001 }
2002 }
2003
2004 inst->remove();
2005 progress = true;
2006 }
2007
2008 if (progress)
2009 live_intervals_valid = false;
2010
2011 return progress;
2012 }
2013
2014
2015 bool
2016 fs_visitor::compute_to_mrf()
2017 {
2018 bool progress = false;
2019 int next_ip = 0;
2020
2021 calculate_live_intervals();
2022
2023 foreach_list_safe(node, &this->instructions) {
2024 fs_inst *inst = (fs_inst *)node;
2025
2026 int ip = next_ip;
2027 next_ip++;
2028
2029 if (inst->opcode != BRW_OPCODE_MOV ||
2030 inst->predicate ||
2031 inst->dst.file != MRF || inst->src[0].file != GRF ||
2032 inst->dst.type != inst->src[0].type ||
2033 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2034 continue;
2035
2036 /* Work out which hardware MRF registers are written by this
2037 * instruction.
2038 */
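      /* A COMPR4 message writes mrf_low for its first half and mrf_low + 4
       * for the second; a plain compressed (SIMD16) write covers mrf_low and
       * mrf_low + 1.
       */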
2039 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2040 int mrf_high;
2041 if (inst->dst.reg & BRW_MRF_COMPR4) {
2042 mrf_high = mrf_low + 4;
2043 } else if (dispatch_width == 16 &&
2044 (!inst->force_uncompressed && !inst->force_sechalf)) {
2045 mrf_high = mrf_low + 1;
2046 } else {
2047 mrf_high = mrf_low;
2048 }
2049
2050 /* Can't compute-to-MRF this GRF if someone else was going to
2051 * read it later.
2052 */
2053 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2054 continue;
2055
 2056       /* Found a move of a GRF to an MRF.  Let's see if we can rewrite the
 2057        * instruction that produced this GRF so it writes into the MRF instead.
2058 */
2059 fs_inst *scan_inst;
2060 for (scan_inst = (fs_inst *)inst->prev;
2061 scan_inst->prev != NULL;
2062 scan_inst = (fs_inst *)scan_inst->prev) {
2063 if (scan_inst->dst.file == GRF &&
2064 scan_inst->dst.reg == inst->src[0].reg) {
 2065          /* Found the last instruction to write the register we want to
 2066           * turn into a compute-to-MRF.
2067 */
2068
2069 /* If it's predicated, it (probably) didn't populate all
2070 * the channels. We might be able to rewrite everything
2071 * that writes that reg, but it would require smarter
2072 * tracking to delay the rewriting until complete success.
2073 */
2074 if (scan_inst->predicate)
2075 break;
2076
2077 /* If it's half of register setup and not the same half as
2078 * our MOV we're trying to remove, bail for now.
2079 */
2080 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2081 scan_inst->force_sechalf != inst->force_sechalf) {
2082 break;
2083 }
2084
2085 /* SEND instructions can't have MRF as a destination. */
2086 if (scan_inst->mlen)
2087 break;
2088
2089 if (intel->gen == 6) {
2090 /* gen6 math instructions must have the destination be
2091 * GRF, so no compute-to-MRF for them.
2092 */
2093 if (scan_inst->is_math()) {
2094 break;
2095 }
2096 }
2097
2098 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2099 /* Found the creator of our MRF's source value. */
2100 scan_inst->dst.file = MRF;
2101 scan_inst->dst.reg = inst->dst.reg;
2102 scan_inst->saturate |= inst->saturate;
2103 inst->remove();
2104 progress = true;
2105 }
2106 break;
2107 }
2108
 2109          /* We don't handle control flow here.  Most values that end up in
 2110           * MRFs are computed shortly before the MRF write anyway.
2112 */
2113 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2114 break;
2115
2116 /* You can't read from an MRF, so if someone else reads our
2117 * MRF's source GRF that we wanted to rewrite, that stops us.
2118 */
2119 bool interfered = false;
2120 for (int i = 0; i < 3; i++) {
2121 if (scan_inst->src[i].file == GRF &&
2122 scan_inst->src[i].reg == inst->src[0].reg &&
2123 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2124 interfered = true;
2125 }
2126 }
2127 if (interfered)
2128 break;
2129
2130 if (scan_inst->dst.file == MRF) {
2131 /* If somebody else writes our MRF here, we can't
2132 * compute-to-MRF before that.
2133 */
2134 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2135 int scan_mrf_high;
2136
2137 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2138 scan_mrf_high = scan_mrf_low + 4;
2139 } else if (dispatch_width == 16 &&
2140 (!scan_inst->force_uncompressed &&
2141 !scan_inst->force_sechalf)) {
2142 scan_mrf_high = scan_mrf_low + 1;
2143 } else {
2144 scan_mrf_high = scan_mrf_low;
2145 }
2146
2147 if (mrf_low == scan_mrf_low ||
2148 mrf_low == scan_mrf_high ||
2149 mrf_high == scan_mrf_low ||
2150 mrf_high == scan_mrf_high) {
2151 break;
2152 }
2153 }
2154
2155 if (scan_inst->mlen > 0) {
2156 /* Found a SEND instruction, which means that there are
2157 * live values in MRFs from base_mrf to base_mrf +
2158 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2159 * above it.
2160 */
2161 if (mrf_low >= scan_inst->base_mrf &&
2162 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2163 break;
2164 }
2165 if (mrf_high >= scan_inst->base_mrf &&
2166 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2167 break;
2168 }
2169 }
2170 }
2171 }
2172
2173 if (progress)
2174 live_intervals_valid = false;
2175
2176 return progress;
2177 }
2178
2179 /**
2180 * Walks through basic blocks, looking for repeated MRF writes and
2181 * removing the later ones.
2182 */
2183 bool
2184 fs_visitor::remove_duplicate_mrf_writes()
2185 {
2186 fs_inst *last_mrf_move[16];
2187 bool progress = false;
2188
 2189    /* The MRF tracking below doesn't yet handle both registers written by
          * compressed (SIMD16) instructions, so skip this pass for 16-wide.
          */
2190 if (dispatch_width == 16)
2191 return false;
2192
2193 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2194
2195 foreach_list_safe(node, &this->instructions) {
2196 fs_inst *inst = (fs_inst *)node;
2197
2198 if (inst->is_control_flow()) {
2199 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2200 }
2201
2202 if (inst->opcode == BRW_OPCODE_MOV &&
2203 inst->dst.file == MRF) {
2204 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2205 if (prev_inst && inst->equals(prev_inst)) {
2206 inst->remove();
2207 progress = true;
2208 continue;
2209 }
2210 }
2211
2212 /* Clear out the last-write records for MRFs that were overwritten. */
2213 if (inst->dst.file == MRF) {
2214 last_mrf_move[inst->dst.reg] = NULL;
2215 }
2216
2217 if (inst->mlen > 0) {
2218 /* Found a SEND instruction, which will include two or fewer
2219 * implied MRF writes. We could do better here.
2220 */
2221 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2222 last_mrf_move[inst->base_mrf + i] = NULL;
2223 }
2224 }
2225
2226 /* Clear out any MRF move records whose sources got overwritten. */
2227 if (inst->dst.file == GRF) {
2228 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2229 if (last_mrf_move[i] &&
2230 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2231 last_mrf_move[i] = NULL;
2232 }
2233 }
2234 }
2235
2236 if (inst->opcode == BRW_OPCODE_MOV &&
2237 inst->dst.file == MRF &&
2238 inst->src[0].file == GRF &&
2239 !inst->predicate) {
2240 last_mrf_move[inst->dst.reg] = inst;
2241 }
2242 }
2243
2244 if (progress)
2245 live_intervals_valid = false;
2246
2247 return progress;
2248 }
2249
2250 static void
2251 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2252 int first_grf, int grf_len)
2253 {
2254 bool inst_16wide = (dispatch_width > 8 &&
2255 !inst->force_uncompressed &&
2256 !inst->force_sechalf);
2257
2258 /* Clear the flag for registers that actually got read (as expected). */
2259 for (int i = 0; i < 3; i++) {
2260 int grf;
2261 if (inst->src[i].file == GRF) {
2262 grf = inst->src[i].reg;
2263 } else if (inst->src[i].file == FIXED_HW_REG &&
2264 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2265 grf = inst->src[i].fixed_hw_reg.nr;
2266 } else {
2267 continue;
2268 }
2269
2270 if (grf >= first_grf &&
2271 grf < first_grf + grf_len) {
2272 deps[grf - first_grf] = false;
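             /* A 16-wide source spans two adjacent GRFs, so the following
              * register was read as well.
              */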
2273 if (inst_16wide)
2274 deps[grf - first_grf + 1] = false;
2275 }
2276 }
2277 }
2278
2279 /**
2280 * Implements this workaround for the original 965:
2281 *
2282 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2283 * check for post destination dependencies on this instruction, software
2284 * must ensure that there is no destination hazard for the case of ‘write
2285 * followed by a posted write’ shown in the following example.
2286 *
2287 * 1. mov r3 0
2288 * 2. send r3.xy <rest of send instruction>
2289 * 3. mov r2 r3
2290 *
2291 * Due to no post-destination dependency check on the ‘send’, the above
2292 * code sequence could have two instructions (1 and 2) in flight at the
2293 * same time that both consider ‘r3’ as the target of their final writes.
2294 */
2295 void
2296 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2297 {
2298 int reg_size = dispatch_width / 8;
2299 int write_len = inst->regs_written() * reg_size;
2300 int first_write_grf = inst->dst.reg;
2301 bool needs_dep[BRW_MAX_MRF];
2302 assert(write_len < (int)sizeof(needs_dep) - 1);
2303
2304 memset(needs_dep, false, sizeof(needs_dep));
2305 memset(needs_dep, true, write_len);
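       /* needs_dep[i] is true while GRF first_write_grf + i may still have an
        * outstanding write in flight that inst's send could race with.
        */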
2306
2307 clear_deps_for_inst_src(inst, dispatch_width,
2308 needs_dep, first_write_grf, write_len);
2309
2310 /* Walk backwards looking for writes to registers we're writing which
2311 * aren't read since being written. If we hit the start of the program,
2312 * we assume that there are no outstanding dependencies on entry to the
2313 * program.
2314 */
2315 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2316 scan_inst != NULL;
2317 scan_inst = (fs_inst *)scan_inst->prev) {
2318
2319 /* If we hit control flow, assume that there *are* outstanding
2320 * dependencies, and force their cleanup before our instruction.
2321 */
2322 if (scan_inst->is_control_flow()) {
2323 for (int i = 0; i < write_len; i++) {
2324 if (needs_dep[i]) {
2325 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2326 }
2327 }
2328 }
2329
2330 bool scan_inst_16wide = (dispatch_width > 8 &&
2331 !scan_inst->force_uncompressed &&
2332 !scan_inst->force_sechalf);
2333
2334 /* We insert our reads as late as possible on the assumption that any
2335 * instruction but a MOV that might have left us an outstanding
2336 * dependency has more latency than a MOV.
2337 */
2338 if (scan_inst->dst.file == GRF) {
2339 for (int i = 0; i < scan_inst->regs_written(); i++) {
2340 int reg = scan_inst->dst.reg + i * reg_size;
2341
2342 if (reg >= first_write_grf &&
2343 reg < first_write_grf + write_len &&
2344 needs_dep[reg - first_write_grf]) {
2345 inst->insert_before(DEP_RESOLVE_MOV(reg));
2346 needs_dep[reg - first_write_grf] = false;
2347 if (scan_inst_16wide)
2348 needs_dep[reg - first_write_grf + 1] = false;
2349 }
2350 }
2351 }
2352
2353 /* Clear the flag for registers that actually got read (as expected). */
2354 clear_deps_for_inst_src(scan_inst, dispatch_width,
2355 needs_dep, first_write_grf, write_len);
2356
2357 /* Continue the loop only if we haven't resolved all the dependencies */
2358 int i;
2359 for (i = 0; i < write_len; i++) {
2360 if (needs_dep[i])
2361 break;
2362 }
2363 if (i == write_len)
2364 return;
2365 }
2366 }
2367
2368 /**
2369 * Implements this workaround for the original 965:
2370 *
2371 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2372 * used as a destination register until after it has been sourced by an
2373 * instruction with a different destination register.
2374 */
2375 void
2376 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2377 {
2378 int write_len = inst->regs_written() * dispatch_width / 8;
2379 int first_write_grf = inst->dst.reg;
2380 bool needs_dep[BRW_MAX_MRF];
2381 assert(write_len < (int)sizeof(needs_dep) - 1);
2382
2383 memset(needs_dep, false, sizeof(needs_dep));
2384 memset(needs_dep, true, write_len);
2385 /* Walk forwards looking for writes to registers we're writing which aren't
2386 * read before being written.
2387 */
2388 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2389 !scan_inst->is_tail_sentinel();
2390 scan_inst = (fs_inst *)scan_inst->next) {
2391 /* If we hit control flow, force resolve all remaining dependencies. */
2392 if (scan_inst->is_control_flow()) {
2393 for (int i = 0; i < write_len; i++) {
2394 if (needs_dep[i])
2395 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2396 }
2397 }
2398
2399 /* Clear the flag for registers that actually got read (as expected). */
2400 clear_deps_for_inst_src(scan_inst, dispatch_width,
2401 needs_dep, first_write_grf, write_len);
2402
2403 /* We insert our reads as late as possible since they're reading the
2404 * result of a SEND, which has massive latency.
2405 */
2406 if (scan_inst->dst.file == GRF &&
2407 scan_inst->dst.reg >= first_write_grf &&
2408 scan_inst->dst.reg < first_write_grf + write_len &&
2409 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2410 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2411 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2412 }
2413
2414 /* Continue the loop only if we haven't resolved all the dependencies */
2415 int i;
2416 for (i = 0; i < write_len; i++) {
2417 if (needs_dep[i])
2418 break;
2419 }
2420 if (i == write_len)
2421 return;
2422 }
2423
2424 /* If we hit the end of the program, resolve all remaining dependencies out
2425 * of paranoia.
2426 */
2427 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2428 assert(last_inst->eot);
2429 for (int i = 0; i < write_len; i++) {
2430 if (needs_dep[i])
2431 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2432 }
2433 }
2434
2435 void
2436 fs_visitor::insert_gen4_send_dependency_workarounds()
2437 {
2438 if (intel->gen != 4 || intel->is_g4x)
2439 return;
2440
2441 /* Note that we're done with register allocation, so GRF fs_regs always
2442 * have a .reg_offset of 0.
2443 */
2444
2445 foreach_list_safe(node, &this->instructions) {
2446 fs_inst *inst = (fs_inst *)node;
2447
2448 if (inst->mlen != 0 && inst->dst.file == GRF) {
2449 insert_gen4_pre_send_dependency_workarounds(inst);
2450 insert_gen4_post_send_dependency_workarounds(inst);
2451 }
2452 }
2453 }
2454
2455 /**
2456 * Turns the generic expression-style uniform pull constant load instruction
2457 * into a hardware-specific series of instructions for loading a pull
2458 * constant.
2459 *
2460 * The expression style allows the CSE pass before this to optimize out
2461 * repeated loads from the same offset, and gives the pre-register-allocation
2462 * scheduling full flexibility, while the conversion to native instructions
2463 * allows the post-register-allocation scheduler the best information
2464 * possible.
2465 *
2466 * Note that execution masking for setting up pull constant loads is special:
2467 * the channels that need to be written are unrelated to the current execution
2468 * mask, since a later instruction will use one of the result channels as a
2469 * source operand for all 8 or 16 of its channels.
2470 */
2471 void
2472 fs_visitor::lower_uniform_pull_constant_loads()
2473 {
2474 foreach_list(node, &this->instructions) {
2475 fs_inst *inst = (fs_inst *)node;
2476
2477 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2478 continue;
2479
2480 if (intel->gen >= 7) {
2481 /* The offset arg before was a vec4-aligned byte offset. We need to
2482 * turn it into a dword offset.
2483 */
2484 fs_reg const_offset_reg = inst->src[1];
2485 assert(const_offset_reg.file == IMM &&
2486 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2487 const_offset_reg.imm.u /= 4;
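             /* For example, a vec4-aligned byte offset of 16 becomes dword
              * offset 4.
              */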
2488 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2489
2490 /* This is actually going to be a MOV, but since only the first dword
2491 * is accessed, we have a special opcode to do just that one. Note
2492 * that this needs to be an operation that will be considered a def
2493 * by live variable analysis, or register allocation will explode.
2494 */
2495 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2496 payload, const_offset_reg);
2497 setup->force_writemask_all = true;
2498
2499 setup->ir = inst->ir;
2500 setup->annotation = inst->annotation;
2501 inst->insert_before(setup);
2502
2503 /* Similarly, this will only populate the first 4 channels of the
2504 * result register (since we only use smear values from 0-3), but we
2505 * don't tell the optimizer.
2506 */
2507 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2508 inst->src[1] = payload;
2509
2510 this->live_intervals_valid = false;
2511 } else {
2512 /* Before register allocation, we didn't tell the scheduler about the
2513 * MRF we use. We know it's safe to use this MRF because nothing
2514 * else does except for register spill/unspill, which generates and
2515 * uses its MRF within a single IR instruction.
2516 */
2517 inst->base_mrf = 14;
2518 inst->mlen = 1;
2519 }
2520 }
2521 }
2522
2523 void
2524 fs_visitor::dump_instruction(fs_inst *inst)
2525 {
2526 if (inst->predicate) {
2527 printf("(%cf0.%d) ",
2528 inst->predicate_inverse ? '-' : '+',
2529 inst->flag_subreg);
2530 }
2531
2532 printf("%s", brw_instruction_name(inst->opcode));
2533 if (inst->saturate)
2534 printf(".sat");
2535 if (inst->conditional_mod) {
2536 printf(".cmod");
2537 if (!inst->predicate &&
2538 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2539 inst->opcode != BRW_OPCODE_IF &&
2540 inst->opcode != BRW_OPCODE_WHILE))) {
 2541          printf(".f0.%d", inst->flag_subreg);
2542 }
2543 }
2544 printf(" ");
2545
2546
2547 switch (inst->dst.file) {
2548 case GRF:
2549 printf("vgrf%d", inst->dst.reg);
2550 if (inst->dst.reg_offset)
2551 printf("+%d", inst->dst.reg_offset);
2552 break;
2553 case MRF:
2554 printf("m%d", inst->dst.reg);
2555 break;
2556 case BAD_FILE:
2557 printf("(null)");
2558 break;
2559 case UNIFORM:
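       /* Uniforms are never valid destinations; the asterisks flag the anomaly. */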
2560 printf("***u%d***", inst->dst.reg);
2561 break;
2562 default:
2563 printf("???");
2564 break;
2565 }
2566 printf(", ");
2567
2568 for (int i = 0; i < 3; i++) {
2569 if (inst->src[i].negate)
2570 printf("-");
2571 if (inst->src[i].abs)
2572 printf("|");
2573 switch (inst->src[i].file) {
2574 case GRF:
2575 printf("vgrf%d", inst->src[i].reg);
2576 if (inst->src[i].reg_offset)
2577 printf("+%d", inst->src[i].reg_offset);
2578 break;
2579 case MRF:
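          /* MRFs can't be read as sources; the asterisks flag the anomaly. */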
2580 printf("***m%d***", inst->src[i].reg);
2581 break;
2582 case UNIFORM:
2583 printf("u%d", inst->src[i].reg);
2584 if (inst->src[i].reg_offset)
2585 printf(".%d", inst->src[i].reg_offset);
2586 break;
2587 case BAD_FILE:
2588 printf("(null)");
2589 break;
2590 case IMM:
2591 switch (inst->src[i].type) {
2592 case BRW_REGISTER_TYPE_F:
2593 printf("%ff", inst->src[i].imm.f);
2594 break;
2595 case BRW_REGISTER_TYPE_D:
2596 printf("%dd", inst->src[i].imm.i);
2597 break;
2598 case BRW_REGISTER_TYPE_UD:
2599 printf("%uu", inst->src[i].imm.u);
2600 break;
2601 default:
2602 printf("???");
2603 break;
2604 }
2605 break;
2606 default:
2607 printf("???");
2608 break;
2609 }
2610 if (inst->src[i].abs)
2611 printf("|");
2612
 2613       if (i < 2)
2614 printf(", ");
2615 }
2616
2617 printf(" ");
2618
2619 if (inst->force_uncompressed)
2620 printf("1sthalf ");
2621
2622 if (inst->force_sechalf)
2623 printf("2ndhalf ");
2624
2625 printf("\n");
2626 }
2627
2628 void
2629 fs_visitor::dump_instructions()
2630 {
2631 int ip = 0;
2632 foreach_list(node, &this->instructions) {
2633 fs_inst *inst = (fs_inst *)node;
2634 printf("%d: ", ip++);
2635 dump_instruction(inst);
2636 }
2637 }
2638
2639 /**
2640 * Possibly returns an instruction that set up @param reg.
2641 *
2642 * Sometimes we want to take the result of some expression/variable
2643 * dereference tree and rewrite the instruction generating the result
2644 * of the tree. When processing the tree, we know that the
2645 * instructions generated are all writing temporaries that are dead
2646 * outside of this tree. So, if we have some instructions that write
2647 * a temporary, we're free to point that temp write somewhere else.
2648 *
 2649  * Note that this doesn't guarantee that the returned instruction wrote
2650 * only reg -- it might be the size=4 destination of a texture instruction.
2651 */
2652 fs_inst *
2653 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2654 fs_inst *end,
2655 fs_reg reg)
2656 {
2657 if (end == start ||
2658 end->predicate ||
2659 end->force_uncompressed ||
2660 end->force_sechalf ||
2661 reg.reladdr ||
2662 !reg.equals(end->dst)) {
2663 return NULL;
2664 } else {
2665 return end;
2666 }
2667 }
2668
2669 void
2670 fs_visitor::setup_payload_gen6()
2671 {
2672 struct intel_context *intel = &brw->intel;
2673 bool uses_depth =
2674 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2675 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2676
2677 assert(intel->gen >= 6);
2678
2679 /* R0-1: masks, pixel X/Y coordinates. */
2680 c->nr_payload_regs = 2;
 2681    /* R2: only for 32-pixel dispatch. */
2682
2683 /* R3-26: barycentric interpolation coordinates. These appear in the
2684 * same order that they appear in the brw_wm_barycentric_interp_mode
2685 * enum. Each set of coordinates occupies 2 registers if dispatch width
2686 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2687 * appear if they were enabled using the "Barycentric Interpolation
2688 * Mode" bits in WM_STATE.
2689 */
2690 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2691 if (barycentric_interp_modes & (1 << i)) {
2692 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2693 c->nr_payload_regs += 2;
2694 if (dispatch_width == 16) {
2695 c->nr_payload_regs += 2;
2696 }
2697 }
2698 }
2699
2700 /* R27: interpolated depth if uses source depth */
2701 if (uses_depth) {
2702 c->source_depth_reg = c->nr_payload_regs;
2703 c->nr_payload_regs++;
2704 if (dispatch_width == 16) {
2705 /* R28: interpolated depth if not 8-wide. */
2706 c->nr_payload_regs++;
2707 }
2708 }
2709 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2710 if (uses_depth) {
2711 c->source_w_reg = c->nr_payload_regs;
2712 c->nr_payload_regs++;
2713 if (dispatch_width == 16) {
2714 /* R30: interpolated W if not 8-wide. */
2715 c->nr_payload_regs++;
2716 }
2717 }
2718 /* R31: MSAA position offsets. */
2719 /* R32-: bary for 32-pixel. */
2720 /* R58-59: interp W for 32-pixel. */
2721
2722 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2723 c->source_depth_to_render_target = true;
2724 }
2725 }
2726
2727 bool
2728 fs_visitor::run()
2729 {
2730 sanity_param_count = fp->Base.Parameters->NumParameters;
2731 uint32_t orig_nr_params = c->prog_data.nr_params;
2732
2733 if (intel->gen >= 6)
2734 setup_payload_gen6();
2735 else
2736 setup_payload_gen4();
2737
2738 if (0) {
2739 emit_dummy_fs();
2740 } else {
2741 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2742 emit_shader_time_begin();
2743
2744 calculate_urb_setup();
2745 if (intel->gen < 6)
2746 emit_interpolation_setup_gen4();
2747 else
2748 emit_interpolation_setup_gen6();
2749
2750 /* We handle discards by keeping track of the still-live pixels in f0.1.
2751 * Initialize it with the dispatched pixels.
2752 */
2753 if (fp->UsesKill) {
2754 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2755 discard_init->flag_subreg = 1;
2756 }
2757
2758 /* Generate FS IR for main(). (the visitor only descends into
2759 * functions called "main").
2760 */
2761 if (shader) {
2762 foreach_list(node, &*shader->ir) {
2763 ir_instruction *ir = (ir_instruction *)node;
2764 base_ir = ir;
2765 this->result = reg_undef;
2766 ir->accept(this);
2767 }
2768 } else {
2769 emit_fragment_program_code();
2770 }
2771 base_ir = NULL;
2772 if (failed)
2773 return false;
2774
2775 emit(FS_OPCODE_PLACEHOLDER_HALT);
2776
2777 emit_fb_writes();
2778
2779 split_virtual_grfs();
2780
2781 move_uniform_array_access_to_pull_constants();
2782 setup_pull_constants();
2783
2784 bool progress;
2785 do {
2786 progress = false;
2787
2788 compact_virtual_grfs();
2789
2790 progress = remove_duplicate_mrf_writes() || progress;
2791
2792 progress = opt_algebraic() || progress;
2793 progress = opt_cse() || progress;
2794 progress = opt_copy_propagate() || progress;
2795 progress = dead_code_eliminate() || progress;
2796 progress = register_coalesce() || progress;
2797 progress = register_coalesce_2() || progress;
2798 progress = compute_to_mrf() || progress;
2799 } while (progress);
2800
2801 remove_dead_constants();
2802
2803 schedule_instructions(false);
2804
2805 lower_uniform_pull_constant_loads();
2806
2807 assign_curb_setup();
2808 assign_urb_setup();
2809
2810 if (0) {
2811 /* Debug of register spilling: Go spill everything. */
2812 for (int i = 0; i < virtual_grf_count; i++) {
2813 spill_reg(i);
2814 }
2815 }
2816
2817 if (0)
2818 assign_regs_trivial();
2819 else {
2820 while (!assign_regs()) {
2821 if (failed)
2822 break;
2823 }
2824 }
2825 }
2826 assert(force_uncompressed_stack == 0);
2827 assert(force_sechalf_stack == 0);
2828
2829 /* This must come after all optimization and register allocation, since
2830 * it inserts dead code that happens to have side effects, and it does
2831 * so based on the actual physical registers in use.
2832 */
2833 insert_gen4_send_dependency_workarounds();
2834
2835 if (failed)
2836 return false;
2837
2838 schedule_instructions(true);
2839
2840 if (dispatch_width == 8) {
2841 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2842 } else {
2843 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2844
2845 /* Make sure we didn't try to sneak in an extra uniform */
2846 assert(orig_nr_params == c->prog_data.nr_params);
2847 (void) orig_nr_params;
2848 }
2849
2850 /* If any state parameters were appended, then ParameterValues could have
2851 * been realloced, in which case the driver uniform storage set up by
2852 * _mesa_associate_uniform_storage() would point to freed memory. Make
2853 * sure that didn't happen.
2854 */
2855 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2856
2857 return !failed;
2858 }
2859
2860 const unsigned *
2861 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2862 struct gl_fragment_program *fp,
2863 struct gl_shader_program *prog,
2864 unsigned *final_assembly_size)
2865 {
2866 struct intel_context *intel = &brw->intel;
2867 bool start_busy = false;
2868 float start_time = 0;
2869
2870 if (unlikely(intel->perf_debug)) {
2871 start_busy = (intel->batch.last_bo &&
2872 drm_intel_bo_busy(intel->batch.last_bo));
2873 start_time = get_time();
2874 }
2875
2876 struct brw_shader *shader = NULL;
2877 if (prog)
2878 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2879
2880 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2881 if (shader) {
2882 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2883 _mesa_print_ir(shader->ir, NULL);
2884 printf("\n\n");
2885 } else {
2886 printf("ARB_fragment_program %d ir for native fragment shader\n",
2887 fp->Base.Id);
2888 _mesa_print_program(&fp->Base);
2889 }
2890 }
2891
2892 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2893 */
2894 fs_visitor v(brw, c, prog, fp, 8);
2895 if (!v.run()) {
2896 prog->LinkStatus = false;
2897 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2898
2899 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2900 v.fail_msg);
2901
2902 return NULL;
2903 }
2904
2905 exec_list *simd16_instructions = NULL;
2906 fs_visitor v2(brw, c, prog, fp, 16);
2907 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2908 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2909 v2.import_uniforms(&v);
2910 if (!v2.run()) {
2911 perf_debug("16-wide shader failed to compile, falling back to "
2912 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2913 } else {
2914 simd16_instructions = &v2.instructions;
2915 }
2916 }
2917
2918 c->prog_data.dispatch_width = 8;
2919
2920 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2921 const unsigned *generated = g.generate_assembly(&v.instructions,
2922 simd16_instructions,
2923 final_assembly_size);
2924
2925 if (unlikely(intel->perf_debug) && shader) {
2926 if (shader->compiled_once)
2927 brw_wm_debug_recompile(brw, prog, &c->key);
2928 shader->compiled_once = true;
2929
2930 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2931 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2932 (get_time() - start_time) * 1000);
2933 }
2934 }
2935
2936 return generated;
2937 }
2938
2939 bool
2940 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2941 {
2942 struct brw_context *brw = brw_context(ctx);
2943 struct intel_context *intel = &brw->intel;
2944 struct brw_wm_prog_key key;
2945
2946 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2947 return true;
2948
2949 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2950 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2951 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2952 bool program_uses_dfdy = fp->UsesDFdy;
2953
2954 memset(&key, 0, sizeof(key));
2955
2956 if (intel->gen < 6) {
2957 if (fp->UsesKill)
2958 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2959
2960 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2961 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2962
2963 /* Just assume depth testing. */
2964 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2965 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2966 }
2967
2968 if (prog->Name != 0)
2969 key.proj_attrib_mask = ~(GLbitfield64) 0;
2970 else {
2971 /* Bit VARYING_BIT_POS of key.proj_attrib_mask is never used, so to
2972 * avoid unnecessary recompiles, always set it to 1.
2973 */
2974 key.proj_attrib_mask |= VARYING_BIT_POS;
2975 }
2976
2977 if (intel->gen < 6)
2978 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
2979
2980 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
2981 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2982 continue;
2983
2984 if (prog->Name == 0)
2985 key.proj_attrib_mask |= BITFIELD64_BIT(i);
2986
2987 if (intel->gen < 6) {
2988 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
2989 key.input_slots_valid |= BITFIELD64_BIT(i);
2990 }
2991 }
2992
2993 key.clamp_fragment_color = true;
2994
2995 for (int i = 0; i < MAX_SAMPLERS; i++) {
2996 if (fp->Base.ShadowSamplers & (1 << i)) {
2997 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2998 key.tex.swizzles[i] =
2999 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3000 } else {
3001 /* Color sampler: assume no swizzling. */
3002 key.tex.swizzles[i] = SWIZZLE_XYZW;
3003 }
3004 }
3005
3006 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3007 key.drawable_height = ctx->DrawBuffer->Height;
3008 }
3009
3010 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3011 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3012 }
3013
3014 key.nr_color_regions = 1;
3015
3016 key.program_string_id = bfp->id;
3017
3018 uint32_t old_prog_offset = brw->wm.prog_offset;
3019 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3020
3021 bool success = do_wm_prog(brw, prog, bfp, &key);
3022
3023 brw->wm.prog_offset = old_prog_offset;
3024 brw->wm.prog_data = old_prog_data;
3025
3026 return success;
3027 }