i965: add a new virtual opcode: SHADER_OPCODE_TXF_MS
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

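/* Illustrative note (not in the original): each ALUn(op) invocation below
 * expands to a small emit helper.  For example, ALU2(ADD) produces:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */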
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
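
/* Usage sketch (illustrative, not from the original): a common pattern in
 * this file is a CMP into the null register purely for its flag write,
 * followed by a predicated IF, e.g. for some fs_reg x:
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *    emit(IF(BRW_PREDICATE_NORMAL));
 */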

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXF_MS ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_control_flow()
{
   switch (opcode) {
   case BRW_OPCODE_DO:
   case BRW_OPCODE_WHILE:
   case BRW_OPCODE_IF:
   case BRW_OPCODE_ELSE:
   case BRW_OPCODE_ENDIF:
   case BRW_OPCODE_BREAK:
   case BRW_OPCODE_CONTINUE:
      return true;
   default:
      return false;
   }
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}
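
/* Note (added for clarity): the three overloads above mean the literal type
 * of an argument picks the immediate's register type -- fs_reg(0.5f) is a
 * float immediate, fs_reg(4) a signed dword, and fs_reg(1u) an unsigned
 * dword -- a distinction relied on throughout this file.
 */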

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}
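
/* Worked examples (illustrative): by the rules above, a float or bool is 1
 * slot, a vec4 is 4, a mat3 is 9 (three vec3 columns), and "mat3 m[2]" is
 * 2 * 9 = 18 slots.
 */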

fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index <= brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}
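
/* Message layout note (descriptive, added for clarity): the SHADER_TIME_ADD
 * send above consumes two MRFs -- m6 carries the buffer offset
 * (shader_time_index * 4) and m7 the value to accumulate -- matching the
 * mlen = 2 set on the instruction.
 */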

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
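
/* For example (illustrative): by the table above, a 16-wide
 * SHADER_OPCODE_POW counts 2 * 16 / 8 = 4 MRFs, while the sampler opcodes
 * count only 1 since their coordinate payloads are loaded with explicit
 * MOVs beforehand.
 */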

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
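      /* Example (added, illustrative): a state var swizzled .xyzw adds four
       * parameters here, while one swizzled .xxxx adds just one, since the
       * loop below stops as soon as a swizzle component repeats.
       */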
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
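      /* Added note (assumed from the shifts below): the payload dword g0.0:D
       * carries a back-facing flag in bit 15.  The ASR by 15 brings that bit
       * down to bit 0, NOT inverts it, and AND with 1 discards the rest,
       * leaving 1 for front-facing and 0 for back-facing.
       */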
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}
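
/* Sketch of the effect (illustrative): a size-4 virtual GRF keeps its
 * original number for accesses at reg_offset 0, while accesses at
 * reg_offsets 1..3 are remapped to three freshly allocated size-1 GRFs, so
 * the register allocator no longer has to find four contiguous hardware
 * registers for it.
 */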

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
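         /* Added note: the load above fetched an aligned vec4 (its offset
          * was rounded down with & ~15), so smear selects which of the four
          * components this use actually reads.
          */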
         inst->src[i].smear = pull_index & 3;
      }
   }
}

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something deffed but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *  ^
       *  |
       *  v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
       */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}
1896
1897 bool
1898 fs_visitor::register_coalesce()
1899 {
1900 bool progress = false;
1901 int if_depth = 0;
1902 int loop_depth = 0;
1903
1904 foreach_list_safe(node, &this->instructions) {
1905 fs_inst *inst = (fs_inst *)node;
1906
1907 /* Make sure that we dominate the instructions we're going to
1908 * scan for interfering with our coalescing, or we won't have
1909 * scanned enough to see if anything interferes with our
1910 * coalescing. We don't dominate the following instructions if
1911 * we're in a loop or an if block.
1912 */
1913 switch (inst->opcode) {
1914 case BRW_OPCODE_DO:
1915 loop_depth++;
1916 break;
1917 case BRW_OPCODE_WHILE:
1918 loop_depth--;
1919 break;
1920 case BRW_OPCODE_IF:
1921 if_depth++;
1922 break;
1923 case BRW_OPCODE_ENDIF:
1924 if_depth--;
1925 break;
1926 default:
1927 break;
1928 }
1929 if (loop_depth || if_depth)
1930 continue;
1931
1932 if (inst->opcode != BRW_OPCODE_MOV ||
1933 inst->predicate ||
1934 inst->saturate ||
1935 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1936 inst->src[0].file != UNIFORM)||
1937 inst->dst.type != inst->src[0].type)
1938 continue;
1939
1940 bool has_source_modifiers = (inst->src[0].abs ||
1941 inst->src[0].negate ||
1942 inst->src[0].smear != -1 ||
1943 inst->src[0].file == UNIFORM);
1944
1945 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1946 * them: check for no writes to either one until the exit of the
1947 * program.
1948 */
1949 bool interfered = false;
1950
1951 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1952 !scan_inst->is_tail_sentinel();
1953 scan_inst = (fs_inst *)scan_inst->next) {
1954 if (scan_inst->dst.file == GRF) {
1955 if (scan_inst->overwrites_reg(inst->dst) ||
1956 scan_inst->overwrites_reg(inst->src[0])) {
1957 interfered = true;
1958 break;
1959 }
1960 }
1961
1962 /* The gen6 MATH instruction can't handle source modifiers or
1963 * unusual register regions, so avoid coalescing those for
1964 * now. We should do something more specific.
1965 */
1966 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1967 interfered = true;
1968 break;
1969 }
1970
1971 /* The accumulator result appears to get used for the
1972 * conditional modifier generation. When negating a UD
1973 * value, there is a 33rd bit generated for the sign in the
1974 * accumulator value, so now you can't check, for example,
1975 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1976 */
1977 if (scan_inst->conditional_mod &&
1978 inst->src[0].negate &&
1979 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1980 interfered = true;
1981 break;
1982 }
1983 }
1984 if (interfered) {
1985 continue;
1986 }
1987
1988 /* Rewrite the later usage to point at the source of the move to
1989 * be removed.
1990 */
1991 for (fs_inst *scan_inst = inst;
1992 !scan_inst->is_tail_sentinel();
1993 scan_inst = (fs_inst *)scan_inst->next) {
1994 for (int i = 0; i < 3; i++) {
1995 if (scan_inst->src[i].file == GRF &&
1996 scan_inst->src[i].reg == inst->dst.reg &&
1997 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1998 fs_reg new_src = inst->src[0];
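               /* Compose the source modifiers: taking the absolute value
                * discards any negation already on the coalesced source,
                * and negations otherwise accumulate (hence the XOR below).
                */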
1999 if (scan_inst->src[i].abs) {
2000 new_src.negate = 0;
2001 new_src.abs = 1;
2002 }
2003 new_src.negate ^= scan_inst->src[i].negate;
2004 scan_inst->src[i] = new_src;
2005 }
2006 }
2007 }
2008
2009 inst->remove();
2010 progress = true;
2011 }
2012
2013 if (progress)
2014 live_intervals_valid = false;
2015
2016 return progress;
2017 }
2018
2019
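/**
 * Tries to fold a GRF-to-MRF MOV back into the instruction that computed
 * the GRF value, making that instruction write the MRF directly.  An
 * illustrative sequence (with made-up register numbers):
 *
 *    add vgrf4, vgrf2, vgrf3
 *    mov m1, vgrf4             -> add m1, vgrf2, vgrf3
 */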
2020 bool
2021 fs_visitor::compute_to_mrf()
2022 {
2023 bool progress = false;
2024 int next_ip = 0;
2025
2026 calculate_live_intervals();
2027
2028 foreach_list_safe(node, &this->instructions) {
2029 fs_inst *inst = (fs_inst *)node;
2030
2031 int ip = next_ip;
2032 next_ip++;
2033
2034 if (inst->opcode != BRW_OPCODE_MOV ||
2035 inst->predicate ||
2036 inst->dst.file != MRF || inst->src[0].file != GRF ||
2037 inst->dst.type != inst->src[0].type ||
2038 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2039 continue;
2040
2041 /* Work out which hardware MRF registers are written by this
2042 * instruction.
2043 */
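      /* A COMPR4 destination writes both reg and reg + 4, a plain 16-wide
       * write covers two consecutive MRFs, and everything else touches a
       * single MRF.
       */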
2044 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2045 int mrf_high;
2046 if (inst->dst.reg & BRW_MRF_COMPR4) {
2047 mrf_high = mrf_low + 4;
2048 } else if (dispatch_width == 16 &&
2049 (!inst->force_uncompressed && !inst->force_sechalf)) {
2050 mrf_high = mrf_low + 1;
2051 } else {
2052 mrf_high = mrf_low;
2053 }
2054
2055 /* Can't compute-to-MRF this GRF if someone else was going to
2056 * read it later.
2057 */
2058 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2059 continue;
2060
      /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
       * the instruction that computed the GRF value to write into the
       * MRF directly.
       */
2064 fs_inst *scan_inst;
2065 for (scan_inst = (fs_inst *)inst->prev;
2066 scan_inst->prev != NULL;
2067 scan_inst = (fs_inst *)scan_inst->prev) {
2068 if (scan_inst->dst.file == GRF &&
2069 scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last instruction to write the register we want
             * to turn into a compute-to-MRF.
             */
2073
2074 /* If it's predicated, it (probably) didn't populate all
2075 * the channels. We might be able to rewrite everything
2076 * that writes that reg, but it would require smarter
2077 * tracking to delay the rewriting until complete success.
2078 */
2079 if (scan_inst->predicate)
2080 break;
2081
            /* If it writes only half of the register, and it's not the
             * same half as the MOV we're trying to remove, bail for now.
             */
2085 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2086 scan_inst->force_sechalf != inst->force_sechalf) {
2087 break;
2088 }
2089
2090 /* SEND instructions can't have MRF as a destination. */
2091 if (scan_inst->mlen)
2092 break;
2093
2094 if (intel->gen == 6) {
2095 /* gen6 math instructions must have the destination be
2096 * GRF, so no compute-to-MRF for them.
2097 */
2098 if (scan_inst->is_math()) {
2099 break;
2100 }
2101 }
2102
2103 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2104 /* Found the creator of our MRF's source value. */
2105 scan_inst->dst.file = MRF;
2106 scan_inst->dst.reg = inst->dst.reg;
2107 scan_inst->saturate |= inst->saturate;
2108 inst->remove();
2109 progress = true;
2110 }
2111 break;
2112 }
2113
         /* We don't handle control flow here.  Most computation of values
          * that end up in MRFs happens shortly before the MRF write anyway.
2117 */
2118 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2119 break;
2120
2121 /* You can't read from an MRF, so if someone else reads our
2122 * MRF's source GRF that we wanted to rewrite, that stops us.
2123 */
2124 bool interfered = false;
2125 for (int i = 0; i < 3; i++) {
2126 if (scan_inst->src[i].file == GRF &&
2127 scan_inst->src[i].reg == inst->src[0].reg &&
2128 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2129 interfered = true;
2130 }
2131 }
2132 if (interfered)
2133 break;
2134
2135 if (scan_inst->dst.file == MRF) {
2136 /* If somebody else writes our MRF here, we can't
2137 * compute-to-MRF before that.
2138 */
2139 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2140 int scan_mrf_high;
2141
2142 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2143 scan_mrf_high = scan_mrf_low + 4;
2144 } else if (dispatch_width == 16 &&
2145 (!scan_inst->force_uncompressed &&
2146 !scan_inst->force_sechalf)) {
2147 scan_mrf_high = scan_mrf_low + 1;
2148 } else {
2149 scan_mrf_high = scan_mrf_low;
2150 }
2151
2152 if (mrf_low == scan_mrf_low ||
2153 mrf_low == scan_mrf_high ||
2154 mrf_high == scan_mrf_low ||
2155 mrf_high == scan_mrf_high) {
2156 break;
2157 }
2158 }
2159
2160 if (scan_inst->mlen > 0) {
2161 /* Found a SEND instruction, which means that there are
2162 * live values in MRFs from base_mrf to base_mrf +
2163 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2164 * above it.
2165 */
2166 if (mrf_low >= scan_inst->base_mrf &&
2167 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2168 break;
2169 }
2170 if (mrf_high >= scan_inst->base_mrf &&
2171 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2172 break;
2173 }
2174 }
2175 }
2176 }
2177
2178 if (progress)
2179 live_intervals_valid = false;
2180
2181 return progress;
2182 }
2183
2184 /**
2185 * Walks through basic blocks, looking for repeated MRF writes and
2186 * removing the later ones.
2187 */
2188 bool
2189 fs_visitor::remove_duplicate_mrf_writes()
2190 {
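   /* For each MRF, the last unpredicated GRF-to-MRF MOV seen in the current
    * basic block, or NULL when that MRF's contents are unknown.
    */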
2191 fs_inst *last_mrf_move[16];
2192 bool progress = false;
2193
   /* The MRF write tracking below doesn't handle compressed instructions
    * yet, so bail in the 16-wide case.
    */
2195 if (dispatch_width == 16)
2196 return false;
2197
2198 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2199
2200 foreach_list_safe(node, &this->instructions) {
2201 fs_inst *inst = (fs_inst *)node;
2202
2203 if (inst->is_control_flow()) {
2204 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2205 }
2206
2207 if (inst->opcode == BRW_OPCODE_MOV &&
2208 inst->dst.file == MRF) {
2209 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2210 if (prev_inst && inst->equals(prev_inst)) {
2211 inst->remove();
2212 progress = true;
2213 continue;
2214 }
2215 }
2216
2217 /* Clear out the last-write records for MRFs that were overwritten. */
2218 if (inst->dst.file == MRF) {
2219 last_mrf_move[inst->dst.reg] = NULL;
2220 }
2221
2222 if (inst->mlen > 0) {
         /* Found a SEND instruction, whose generated code can implicitly
          * write message MRFs that never appear as IR destinations, so
          * conservatively drop those records.  We could do better here.
          */
2226 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2227 last_mrf_move[inst->base_mrf + i] = NULL;
2228 }
2229 }
2230
2231 /* Clear out any MRF move records whose sources got overwritten. */
2232 if (inst->dst.file == GRF) {
2233 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2234 if (last_mrf_move[i] &&
2235 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2236 last_mrf_move[i] = NULL;
2237 }
2238 }
2239 }
2240
2241 if (inst->opcode == BRW_OPCODE_MOV &&
2242 inst->dst.file == MRF &&
2243 inst->src[0].file == GRF &&
2244 !inst->predicate) {
2245 last_mrf_move[inst->dst.reg] = inst;
2246 }
2247 }
2248
2249 if (progress)
2250 live_intervals_valid = false;
2251
2252 return progress;
2253 }
2254
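/**
 * For each GRF read by the instruction that falls within [first_grf,
 * first_grf + grf_len), clears the corresponding entry in deps: a read
 * means any outstanding write to that register has been consumed.
 */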
2255 static void
2256 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2257 int first_grf, int grf_len)
2258 {
2259 bool inst_16wide = (dispatch_width > 8 &&
2260 !inst->force_uncompressed &&
2261 !inst->force_sechalf);
2262
2263 /* Clear the flag for registers that actually got read (as expected). */
2264 for (int i = 0; i < 3; i++) {
2265 int grf;
2266 if (inst->src[i].file == GRF) {
2267 grf = inst->src[i].reg;
2268 } else if (inst->src[i].file == FIXED_HW_REG &&
2269 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2270 grf = inst->src[i].fixed_hw_reg.nr;
2271 } else {
2272 continue;
2273 }
2274
2275 if (grf >= first_grf &&
2276 grf < first_grf + grf_len) {
2277 deps[grf - first_grf] = false;
2278 if (inst_16wide)
2279 deps[grf - first_grf + 1] = false;
2280 }
2281 }
2282 }
2283
2284 /**
2285 * Implements this workaround for the original 965:
2286 *
2287 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2288 * check for post destination dependencies on this instruction, software
2289 * must ensure that there is no destination hazard for the case of ‘write
2290 * followed by a posted write’ shown in the following example.
2291 *
2292 * 1. mov r3 0
2293 * 2. send r3.xy <rest of send instruction>
2294 * 3. mov r2 r3
2295 *
2296 * Due to no post-destination dependency check on the ‘send’, the above
2297 * code sequence could have two instructions (1 and 2) in flight at the
 *    same time that both consider ‘r3’ as the target of their final writes."
2299 */
2300 void
2301 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2302 {
2303 int write_len = inst->regs_written() * dispatch_width / 8;
2304 int first_write_grf = inst->dst.reg;
2305 bool needs_dep[BRW_MAX_MRF];
2306 assert(write_len < (int)sizeof(needs_dep) - 1);
2307
2308 memset(needs_dep, false, sizeof(needs_dep));
2309 memset(needs_dep, true, write_len);
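   /* needs_dep[i] means we still have to prove (or force with a resolve
    * MOV) that no earlier write to first_write_grf + i is left outstanding
    * at the point our SEND posts its write.
    */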
2310
2311 clear_deps_for_inst_src(inst, dispatch_width,
2312 needs_dep, first_write_grf, write_len);
2313
2314 /* Walk backwards looking for writes to registers we're writing which
2315 * aren't read since being written. If we hit the start of the program,
2316 * we assume that there are no outstanding dependencies on entry to the
2317 * program.
2318 */
2319 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2320 scan_inst != NULL;
2321 scan_inst = (fs_inst *)scan_inst->prev) {
2322
2323 /* If we hit control flow, assume that there *are* outstanding
2324 * dependencies, and force their cleanup before our instruction.
2325 */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         /* Every dependency we still needed has just been resolved, so
          * there is nothing left to scan for.
          */
         return;
      }
2333
2334 bool scan_inst_16wide = (dispatch_width > 8 &&
2335 !scan_inst->force_uncompressed &&
2336 !scan_inst->force_sechalf);
2337
2338 /* We insert our reads as late as possible on the assumption that any
2339 * instruction but a MOV that might have left us an outstanding
2340 * dependency has more latency than a MOV.
2341 */
2342 if (scan_inst->dst.file == GRF &&
2343 scan_inst->dst.reg >= first_write_grf &&
2344 scan_inst->dst.reg < first_write_grf + write_len &&
2345 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2346 inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2347 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2348 if (scan_inst_16wide)
2349 needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false;
2350 }
2351
2352 /* Clear the flag for registers that actually got read (as expected). */
2353 clear_deps_for_inst_src(scan_inst, dispatch_width,
2354 needs_dep, first_write_grf, write_len);
2355
2356 /* Continue the loop only if we haven't resolved all the dependencies */
2357 int i;
2358 for (i = 0; i < write_len; i++) {
2359 if (needs_dep[i])
2360 break;
2361 }
2362 if (i == write_len)
2363 return;
2364 }
2365 }
2366
2367 /**
2368 * Implements this workaround for the original 965:
2369 *
2370 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2371 * used as a destination register until after it has been sourced by an
 *    instruction with a different destination register."
2373 */
2374 void
2375 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2376 {
2377 int write_len = inst->regs_written() * dispatch_width / 8;
2378 int first_write_grf = inst->dst.reg;
2379 bool needs_dep[BRW_MAX_MRF];
2380 assert(write_len < (int)sizeof(needs_dep) - 1);
2381
2382 memset(needs_dep, false, sizeof(needs_dep));
2383 memset(needs_dep, true, write_len);
2384 /* Walk forwards looking for writes to registers we're writing which aren't
2385 * read before being written.
2386 */
2387 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2388 !scan_inst->is_tail_sentinel();
2389 scan_inst = (fs_inst *)scan_inst->next) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }
2397
2398 /* Clear the flag for registers that actually got read (as expected). */
2399 clear_deps_for_inst_src(scan_inst, dispatch_width,
2400 needs_dep, first_write_grf, write_len);
2401
2402 /* We insert our reads as late as possible since they're reading the
2403 * result of a SEND, which has massive latency.
2404 */
2405 if (scan_inst->dst.file == GRF &&
2406 scan_inst->dst.reg >= first_write_grf &&
2407 scan_inst->dst.reg < first_write_grf + write_len &&
2408 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2409 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2410 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2411 }
2412
2413 /* Continue the loop only if we haven't resolved all the dependencies */
2414 int i;
2415 for (i = 0; i < write_len; i++) {
2416 if (needs_dep[i])
2417 break;
2418 }
2419 if (i == write_len)
2420 return;
2421 }
2422
2423 /* If we hit the end of the program, resolve all remaining dependencies out
2424 * of paranoia.
2425 */
2426 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2427 assert(last_inst->eot);
2428 for (int i = 0; i < write_len; i++) {
2429 if (needs_dep[i])
2430 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2431 }
2432 }
2433
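/**
 * Applies both of the SEND dependency workarounds above to every SEND that
 * writes a GRF.  Only the original gen4 hardware (not G4X) needs them.
 */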
2434 void
2435 fs_visitor::insert_gen4_send_dependency_workarounds()
2436 {
2437 if (intel->gen != 4 || intel->is_g4x)
2438 return;
2439
2440 /* Note that we're done with register allocation, so GRF fs_regs always
2441 * have a .reg_offset of 0.
2442 */
2443
2444 foreach_list_safe(node, &this->instructions) {
2445 fs_inst *inst = (fs_inst *)node;
2446
2447 if (inst->mlen != 0 && inst->dst.file == GRF) {
2448 insert_gen4_pre_send_dependency_workarounds(inst);
2449 insert_gen4_post_send_dependency_workarounds(inst);
2450 }
2451 }
2452 }
2453
2454 /**
2455 * Turns the generic expression-style uniform pull constant load instruction
2456 * into a hardware-specific series of instructions for loading a pull
2457 * constant.
2458 *
2459 * The expression style allows the CSE pass before this to optimize out
2460 * repeated loads from the same offset, and gives the pre-register-allocation
2461 * scheduling full flexibility, while the conversion to native instructions
2462 * allows the post-register-allocation scheduler the best information
2463 * possible.
2464 */
2465 void
2466 fs_visitor::lower_uniform_pull_constant_loads()
2467 {
2468 foreach_list(node, &this->instructions) {
2469 fs_inst *inst = (fs_inst *)node;
2470
2471 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2472 continue;
2473
2474 if (intel->gen >= 7) {
2475 fs_reg const_offset_reg = inst->src[1];
2476 assert(const_offset_reg.file == IMM &&
2477 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2478 const_offset_reg.imm.u /= 16;
2479 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2480 struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
2481 BRW_REGISTER_TYPE_UD);
2482
2483 fs_inst *setup1 = MOV(payload, fs_reg(g0));
2484 setup1->force_writemask_all = true;
2485 /* We don't need the second half of this vgrf to be filled with g1
2486 * in the 16-wide case, but if we use force_uncompressed then live
2487 * variable analysis won't consider this a def!
2488 */
2489
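         /* SET_GLOBAL_OFFSET patches the header copied from g0 with the
          * oword offset of the constant (hence the divide by 16 above),
          * forming the payload for the gen7 pull constant load.
          */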
2490 fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
2491 payload, payload,
2492 const_offset_reg);
2493
2494 setup1->ir = inst->ir;
2495 setup1->annotation = inst->annotation;
2496 inst->insert_before(setup1);
2497 setup2->ir = inst->ir;
2498 setup2->annotation = inst->annotation;
2499 inst->insert_before(setup2);
2500 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2501 inst->src[1] = payload;
2502 } else {
2503 /* Before register allocation, we didn't tell the scheduler about the
2504 * MRF we use. We know it's safe to use this MRF because nothing
2505 * else does except for register spill/unspill, which generates and
2506 * uses its MRF within a single IR instruction.
2507 */
2508 inst->base_mrf = 14;
2509 inst->mlen = 1;
2510 }
2511 }
2512 }
2513
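/**
 * Prints one IR instruction on a single line: optional predicate, opcode
 * (with .sat/.cmod decorations), destination, then up to three sources.
 * Files that are illegal in a given position, like an MRF source or a
 * uniform destination, are wrapped in "***" so they stand out.
 */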
2514 void
2515 fs_visitor::dump_instruction(fs_inst *inst)
2516 {
2517 if (inst->predicate) {
2518 printf("(%cf0.%d) ",
2519 inst->predicate_inverse ? '-' : '+',
2520 inst->flag_subreg);
2521 }
2522
2523 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2524 opcode_descs[inst->opcode].name) {
2525 printf("%s", opcode_descs[inst->opcode].name);
2526 } else {
2527 switch (inst->opcode) {
2528 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
2529 printf("uniform_pull_const");
2530 break;
2531 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
2532 printf("uniform_pull_const_gen7");
2533 break;
2534 case FS_OPCODE_SET_GLOBAL_OFFSET:
2535 printf("set_global_offset");
2536 break;
2537 default:
2538 printf("op%d", inst->opcode);
2539 break;
2540 }
2541 }
2542 if (inst->saturate)
2543 printf(".sat");
2544 if (inst->conditional_mod) {
2545 printf(".cmod");
2546 if (!inst->predicate &&
2547 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2548 inst->opcode != BRW_OPCODE_IF &&
2549 inst->opcode != BRW_OPCODE_WHILE))) {
         printf(".f0.%d", inst->flag_subreg);
2551 }
2552 }
2553 printf(" ");
2554
2555
2556 switch (inst->dst.file) {
2557 case GRF:
2558 printf("vgrf%d", inst->dst.reg);
2559 if (inst->dst.reg_offset)
2560 printf("+%d", inst->dst.reg_offset);
2561 break;
2562 case MRF:
2563 printf("m%d", inst->dst.reg);
2564 break;
2565 case BAD_FILE:
2566 printf("(null)");
2567 break;
2568 case UNIFORM:
2569 printf("***u%d***", inst->dst.reg);
2570 break;
2571 default:
2572 printf("???");
2573 break;
2574 }
2575 printf(", ");
2576
2577 for (int i = 0; i < 3; i++) {
2578 if (inst->src[i].negate)
2579 printf("-");
2580 if (inst->src[i].abs)
2581 printf("|");
2582 switch (inst->src[i].file) {
2583 case GRF:
2584 printf("vgrf%d", inst->src[i].reg);
2585 if (inst->src[i].reg_offset)
2586 printf("+%d", inst->src[i].reg_offset);
2587 break;
2588 case MRF:
2589 printf("***m%d***", inst->src[i].reg);
2590 break;
2591 case UNIFORM:
2592 printf("u%d", inst->src[i].reg);
2593 if (inst->src[i].reg_offset)
2594 printf(".%d", inst->src[i].reg_offset);
2595 break;
2596 case BAD_FILE:
2597 printf("(null)");
2598 break;
2599 case IMM:
2600 switch (inst->src[i].type) {
2601 case BRW_REGISTER_TYPE_F:
2602 printf("%ff", inst->src[i].imm.f);
2603 break;
2604 case BRW_REGISTER_TYPE_D:
2605 printf("%dd", inst->src[i].imm.i);
2606 break;
2607 case BRW_REGISTER_TYPE_UD:
2608 printf("%uu", inst->src[i].imm.u);
2609 break;
2610 default:
2611 printf("???");
2612 break;
2613 }
2614 break;
2615 default:
2616 printf("???");
2617 break;
2618 }
2619 if (inst->src[i].abs)
2620 printf("|");
2621
      if (i < 2)
2623 printf(", ");
2624 }
2625
2626 printf(" ");
2627
2628 if (inst->force_uncompressed)
2629 printf("1sthalf ");
2630
2631 if (inst->force_sechalf)
2632 printf("2ndhalf ");
2633
2634 printf("\n");
2635 }
2636
2637 void
2638 fs_visitor::dump_instructions()
2639 {
2640 int ip = 0;
2641 foreach_list(node, &this->instructions) {
2642 fs_inst *inst = (fs_inst *)node;
2643 printf("%d: ", ip++);
2644 dump_instruction(inst);
2645 }
2646 }
2647
2648 /**
2649 * Possibly returns an instruction that set up @param reg.
2650 *
2651 * Sometimes we want to take the result of some expression/variable
2652 * dereference tree and rewrite the instruction generating the result
2653 * of the tree. When processing the tree, we know that the
2654 * instructions generated are all writing temporaries that are dead
2655 * outside of this tree. So, if we have some instructions that write
2656 * a temporary, we're free to point that temp write somewhere else.
2657 *
 * Note that this doesn't guarantee that the returned instruction wrote
 * only reg -- it might be the size=4 destination of a texture instruction.
2660 */
2661 fs_inst *
2662 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2663 fs_inst *end,
2664 fs_reg reg)
2665 {
2666 if (end == start ||
2667 end->predicate ||
2668 end->force_uncompressed ||
2669 end->force_sechalf ||
2670 reg.reladdr ||
2671 !reg.equals(end->dst)) {
2672 return NULL;
2673 } else {
2674 return end;
2675 }
2676 }
2677
2678 void
2679 fs_visitor::setup_payload_gen6()
2680 {
2681 struct intel_context *intel = &brw->intel;
2682 bool uses_depth =
2683 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2684 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2685
2686 assert(intel->gen >= 6);
2687
2688 /* R0-1: masks, pixel X/Y coordinates. */
2689 c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch. */
2691
2692 /* R3-26: barycentric interpolation coordinates. These appear in the
2693 * same order that they appear in the brw_wm_barycentric_interp_mode
2694 * enum. Each set of coordinates occupies 2 registers if dispatch width
2695 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2696 * appear if they were enabled using the "Barycentric Interpolation
2697 * Mode" bits in WM_STATE.
2698 */
2699 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2700 if (barycentric_interp_modes & (1 << i)) {
2701 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2702 c->nr_payload_regs += 2;
2703 if (dispatch_width == 16) {
2704 c->nr_payload_regs += 2;
2705 }
2706 }
2707 }
2708
2709 /* R27: interpolated depth if uses source depth */
2710 if (uses_depth) {
2711 c->source_depth_reg = c->nr_payload_regs;
2712 c->nr_payload_regs++;
2713 if (dispatch_width == 16) {
2714 /* R28: interpolated depth if not 8-wide. */
2715 c->nr_payload_regs++;
2716 }
2717 }
2718 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2719 if (uses_depth) {
2720 c->source_w_reg = c->nr_payload_regs;
2721 c->nr_payload_regs++;
2722 if (dispatch_width == 16) {
2723 /* R30: interpolated W if not 8-wide. */
2724 c->nr_payload_regs++;
2725 }
2726 }
2727 /* R31: MSAA position offsets. */
2728 /* R32-: bary for 32-pixel. */
2729 /* R58-59: interp W for 32-pixel. */
2730
2731 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2732 c->source_depth_to_render_target = true;
2733 }
2734 }
2735
2736 bool
2737 fs_visitor::run()
2738 {
2739 sanity_param_count = fp->Base.Parameters->NumParameters;
2740 uint32_t orig_nr_params = c->prog_data.nr_params;
2741
2742 if (intel->gen >= 6)
2743 setup_payload_gen6();
2744 else
2745 setup_payload_gen4();
2746
2747 if (0) {
2748 emit_dummy_fs();
2749 } else {
2750 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2751 emit_shader_time_begin();
2752
2753 calculate_urb_setup();
2754 if (intel->gen < 6)
2755 emit_interpolation_setup_gen4();
2756 else
2757 emit_interpolation_setup_gen6();
2758
2759 /* We handle discards by keeping track of the still-live pixels in f0.1.
2760 * Initialize it with the dispatched pixels.
2761 */
2762 if (fp->UsesKill) {
2763 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2764 discard_init->flag_subreg = 1;
2765 }
2766
2767 /* Generate FS IR for main(). (the visitor only descends into
2768 * functions called "main").
2769 */
2770 if (shader) {
2771 foreach_list(node, &*shader->ir) {
2772 ir_instruction *ir = (ir_instruction *)node;
2773 base_ir = ir;
2774 this->result = reg_undef;
2775 ir->accept(this);
2776 }
2777 } else {
2778 emit_fragment_program_code();
2779 }
2780 base_ir = NULL;
2781 if (failed)
2782 return false;
2783
2784 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2785 emit_shader_time_end();
2786
2787 emit_fb_writes();
2788
2789 split_virtual_grfs();
2790
2791 move_uniform_array_access_to_pull_constants();
2792 setup_pull_constants();
2793
2794 bool progress;
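      /* Run the optimization passes to a fixed point, since each pass can
       * expose new opportunities for the others.
       */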
2795 do {
2796 progress = false;
2797
2798 compact_virtual_grfs();
2799
2800 progress = remove_duplicate_mrf_writes() || progress;
2801
2802 progress = opt_algebraic() || progress;
2803 progress = opt_cse() || progress;
2804 progress = opt_copy_propagate() || progress;
2805 progress = dead_code_eliminate() || progress;
2806 progress = register_coalesce() || progress;
2807 progress = register_coalesce_2() || progress;
2808 progress = compute_to_mrf() || progress;
2809 } while (progress);
2810
2811 remove_dead_constants();
2812
2813 schedule_instructions(false);
2814
2815 lower_uniform_pull_constant_loads();
2816
2817 assign_curb_setup();
2818 assign_urb_setup();
2819
2820 if (0) {
2821 /* Debug of register spilling: Go spill everything. */
2822 for (int i = 0; i < virtual_grf_count; i++) {
2823 spill_reg(i);
2824 }
2825 }
2826
2827 if (0)
2828 assign_regs_trivial();
2829 else {
2830 while (!assign_regs()) {
2831 if (failed)
2832 break;
2833 }
2834 }
2835 }
2836 assert(force_uncompressed_stack == 0);
2837 assert(force_sechalf_stack == 0);
2838
2839 /* This must come after all optimization and register allocation, since
2840 * it inserts dead code that happens to have side effects, and it does
2841 * so based on the actual physical registers in use.
2842 */
2843 insert_gen4_send_dependency_workarounds();
2844
2845 if (failed)
2846 return false;
2847
2848 schedule_instructions(true);
2849
2850 if (dispatch_width == 8) {
2851 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2852 } else {
2853 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2854
2855 /* Make sure we didn't try to sneak in an extra uniform */
2856 assert(orig_nr_params == c->prog_data.nr_params);
2857 (void) orig_nr_params;
2858 }
2859
2860 /* If any state parameters were appended, then ParameterValues could have
2861 * been realloced, in which case the driver uniform storage set up by
2862 * _mesa_associate_uniform_storage() would point to freed memory. Make
2863 * sure that didn't happen.
2864 */
2865 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2866
2867 return !failed;
2868 }
2869
2870 const unsigned *
2871 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2872 struct gl_fragment_program *fp,
2873 struct gl_shader_program *prog,
2874 unsigned *final_assembly_size)
2875 {
2876 struct intel_context *intel = &brw->intel;
2877 bool start_busy = false;
2878 float start_time = 0;
2879
2880 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2881 start_busy = (intel->batch.last_bo &&
2882 drm_intel_bo_busy(intel->batch.last_bo));
2883 start_time = get_time();
2884 }
2885
2886 struct brw_shader *shader = NULL;
2887 if (prog)
2888 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2889
2890 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2891 if (shader) {
2892 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2893 _mesa_print_ir(shader->ir, NULL);
2894 printf("\n\n");
2895 } else {
2896 printf("ARB_fragment_program %d ir for native fragment shader\n",
2897 fp->Base.Id);
2898 _mesa_print_program(&fp->Base);
2899 }
2900 }
2901
2902 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2903 */
2904 fs_visitor v(brw, c, prog, fp, 8);
2905 if (!v.run()) {
2906 prog->LinkStatus = false;
2907 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2908
2909 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2910 v.fail_msg);
2911
2912 return NULL;
2913 }
2914
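   /* Also try a 16-wide compile, which processes twice as many pixels per
    * thread; if it fails, we simply ship only the 8-wide program.
    */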
2915 exec_list *simd16_instructions = NULL;
2916 fs_visitor v2(brw, c, prog, fp, 16);
2917 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2918 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2919 v2.import_uniforms(&v);
2920 if (!v2.run()) {
2921 perf_debug("16-wide shader failed to compile, falling back to "
2922 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2923 } else {
2924 simd16_instructions = &v2.instructions;
2925 }
2926 }
2927
2928 c->prog_data.dispatch_width = 8;
2929
2930 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2931 const unsigned *generated = g.generate_assembly(&v.instructions,
2932 simd16_instructions,
2933 final_assembly_size);
2934
2935 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2936 if (shader->compiled_once)
2937 brw_wm_debug_recompile(brw, prog, &c->key);
2938 shader->compiled_once = true;
2939
2940 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2941 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2942 (get_time() - start_time) * 1000);
2943 }
2944 }
2945
2946 return generated;
2947 }
2948
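/**
 * Precompiles the fragment shader at link time using a guessed program key,
 * so that a matching draw-time compile can hit the program cache.  The
 * current prog_offset/prog_data are saved and restored around the call,
 * since do_wm_prog() overwrites them.
 */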
2949 bool
2950 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2951 {
2952 struct brw_context *brw = brw_context(ctx);
2953 struct intel_context *intel = &brw->intel;
2954 struct brw_wm_prog_key key;
2955
2956 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2957 return true;
2958
2959 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2960 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2961 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2962 bool program_uses_dfdy = fp->UsesDFdy;
2963
2964 memset(&key, 0, sizeof(key));
2965
2966 if (intel->gen < 6) {
2967 if (fp->UsesKill)
2968 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2969
2970 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2971 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2972
2973 /* Just assume depth testing. */
2974 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2975 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2976 }
2977
2978 if (prog->Name != 0)
2979 key.proj_attrib_mask = 0xffffffff;
2980
2981 if (intel->gen < 6)
2982 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2983
2984 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2985 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2986 continue;
2987
2988 if (prog->Name == 0)
2989 key.proj_attrib_mask |= 1 << i;
2990
2991 if (intel->gen < 6) {
2992 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2993
2994 if (vp_index >= 0)
2995 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2996 }
2997 }
2998
2999 key.clamp_fragment_color = true;
3000
3001 for (int i = 0; i < MAX_SAMPLERS; i++) {
3002 if (fp->Base.ShadowSamplers & (1 << i)) {
3003 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3004 key.tex.swizzles[i] =
3005 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3006 } else {
3007 /* Color sampler: assume no swizzling. */
3008 key.tex.swizzles[i] = SWIZZLE_XYZW;
3009 }
3010 }
3011
3012 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
3013 key.drawable_height = ctx->DrawBuffer->Height;
3014 }
3015
3016 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
3017 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3018 }
3019
3020 key.nr_color_regions = 1;
3021
3022 key.program_string_id = bfp->id;
3023
3024 uint32_t old_prog_offset = brw->wm.prog_offset;
3025 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3026
3027 bool success = do_wm_prog(brw, prog, bfp, &key);
3028
3029 brw->wm.prog_offset = old_prog_offset;
3030 brw->wm.prog_data = old_prog_data;
3031
3032 return success;
3033 }