i965/fs: Add basic-block-level dead code elimination.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51 #include "glsl/ir_print_visitor.h"
52
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
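/* ALU1/ALU2/ALU3 define small fs_visitor factory methods (NOT(), ADD(),
 * LRP(), ...) that just allocate a new fs_inst in mem_ctx for the matching
 * BRW opcode; the caller is still responsible for emit()ing it.
 */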
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
176
177 /** Gen4 predicated IF. */
178 fs_inst *
179 fs_visitor::IF(uint32_t predicate)
180 {
181 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
182 inst->predicate = predicate;
183 return inst;
184 }
185
186 /** Gen6+ IF with embedded comparison. */
187 fs_inst *
188 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
189 {
190 assert(intel->gen >= 6);
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
192 reg_null_d, src0, src1);
193 inst->conditional_mod = condition;
194 return inst;
195 }
196
197 /**
198 * CMP: Sets the low bit of the destination channels with the result
199 * of the comparison, while the upper bits are undefined, and updates
200 * the flag register with the packed 16 bits of the result.
201 */
202 fs_inst *
203 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
204 {
205 fs_inst *inst;
206
207 /* Take the instruction:
208 *
209 * CMP null<d> src0<f> src1<f>
210 *
211 * Original gen4 does type conversion to the destination type before
212 * comparison, producing garbage results for floating point comparisons.
213 * gen5 does the comparison on the execution type (resolved source types),
214 * so dst type doesn't matter. gen6 does comparison and then uses the
215 * result as if it was the dst type with no conversion, which happens to
216 * mostly work out for float-interpreted-as-int since our comparisons are
217 * for >0, =0, <0.
218 */
219 if (intel->gen == 4) {
220 dst.type = src0.type;
221 if (dst.file == FIXED_HW_REG)
222 dst.fixed_hw_reg.type = dst.type;
223 }
224
225 resolve_ud_negate(&src0);
226 resolve_ud_negate(&src1);
227
228 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
229 inst->conditional_mod = condition;
230
231 return inst;
232 }
233
234 exec_list
235 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
236 fs_reg varying_offset,
237 uint32_t const_offset)
238 {
239 exec_list instructions;
240 fs_inst *inst;
241
242 /* We have our constant surface use a pitch of 4 bytes, so our index can
243 * be any component of a vector, and then we load 4 contiguous
244 * components starting from that.
245 *
246 * We break down the const_offset to a portion added to the variable
247 * offset and a portion done using reg_offset, which means that if you
248 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
249 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
250 * CSE can later notice that those loads are all the same and eliminate
251 * the redundant ones.
252 */
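   /* Worked example (illustrative only): for const_offset == 7, vec4_offset
    * below becomes varying_offset + 4, and the remaining (7 & 3) == 3 is
    * applied to vec4_result.reg_offset (times scale) to select the component.
    */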
253 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
254 instructions.push_tail(ADD(vec4_offset,
255 varying_offset, const_offset & ~3));
256
257 int scale = 1;
258 if (intel->gen == 4 && dispatch_width == 8) {
259 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
260 * u, v, r) as parameters, or we can just use the SIMD16 message
261 * consisting of (header, u). We choose the second, at the cost of a
262 * longer return length.
263 */
264 scale = 2;
265 }
266
267 enum opcode op;
268 if (intel->gen >= 7)
269 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
270 else
271 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
272 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
273 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
274 inst->regs_written = 4 * scale;
275 instructions.push_tail(inst);
276
277 if (intel->gen < 7) {
278 inst->base_mrf = 13;
279 inst->header_present = true;
280 if (intel->gen == 4)
281 inst->mlen = 3;
282 else
283 inst->mlen = 1 + dispatch_width / 8;
284 }
285
286 vec4_result.reg_offset += (const_offset & 3) * scale;
287 instructions.push_tail(MOV(dst, vec4_result));
288
289 return instructions;
290 }
291
292 /**
293 * A helper for MOV generation for fixing up broken hardware SEND dependency
294 * handling.
295 */
296 fs_inst *
297 fs_visitor::DEP_RESOLVE_MOV(int grf)
298 {
299 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
300
301 inst->ir = NULL;
302 inst->annotation = "send dependency resolve";
303
304 /* The caller always wants uncompressed to emit the minimal extra
305 * dependencies, and to avoid having to deal with aligning its regs to 2.
306 */
307 inst->force_uncompressed = true;
308
309 return inst;
310 }
311
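/**
 * Field-by-field equality test between two instructions, covering the
 * operands and all of the message-related state compared below.
 */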
312 bool
313 fs_inst::equals(fs_inst *inst)
314 {
315 return (opcode == inst->opcode &&
316 dst.equals(inst->dst) &&
317 src[0].equals(inst->src[0]) &&
318 src[1].equals(inst->src[1]) &&
319 src[2].equals(inst->src[2]) &&
320 saturate == inst->saturate &&
321 predicate == inst->predicate &&
322 conditional_mod == inst->conditional_mod &&
323 mlen == inst->mlen &&
324 base_mrf == inst->base_mrf &&
325 sampler == inst->sampler &&
326 target == inst->target &&
327 eot == inst->eot &&
328 header_present == inst->header_present &&
329 shadow_compare == inst->shadow_compare &&
330 offset == inst->offset);
331 }
332
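/**
 * Returns true if \p reg falls within the region this instruction writes:
 * same file and register, with reg_offset inside
 * [dst.reg_offset, dst.reg_offset + regs_written).
 */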
333 bool
334 fs_inst::overwrites_reg(const fs_reg &reg)
335 {
336 return (reg.file == dst.file &&
337 reg.reg == dst.reg &&
338 reg.reg_offset >= dst.reg_offset &&
339 reg.reg_offset < dst.reg_offset + regs_written);
340 }
341
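/** Returns true for any of the texturing (sampler message) opcodes. */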
342 bool
343 fs_inst::is_tex()
344 {
345 return (opcode == SHADER_OPCODE_TEX ||
346 opcode == FS_OPCODE_TXB ||
347 opcode == SHADER_OPCODE_TXD ||
348 opcode == SHADER_OPCODE_TXF ||
349 opcode == SHADER_OPCODE_TXF_MS ||
350 opcode == SHADER_OPCODE_TXL ||
351 opcode == SHADER_OPCODE_TXS ||
352 opcode == SHADER_OPCODE_LOD);
353 }
354
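/** Returns true for the extended math (math unit) opcodes. */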
355 bool
356 fs_inst::is_math()
357 {
358 return (opcode == SHADER_OPCODE_RCP ||
359 opcode == SHADER_OPCODE_RSQ ||
360 opcode == SHADER_OPCODE_SQRT ||
361 opcode == SHADER_OPCODE_EXP2 ||
362 opcode == SHADER_OPCODE_LOG2 ||
363 opcode == SHADER_OPCODE_SIN ||
364 opcode == SHADER_OPCODE_COS ||
365 opcode == SHADER_OPCODE_INT_QUOTIENT ||
366 opcode == SHADER_OPCODE_INT_REMAINDER ||
367 opcode == SHADER_OPCODE_POW);
368 }
369
370 bool
371 fs_inst::is_control_flow()
372 {
373 switch (opcode) {
374 case BRW_OPCODE_DO:
375 case BRW_OPCODE_WHILE:
376 case BRW_OPCODE_IF:
377 case BRW_OPCODE_ELSE:
378 case BRW_OPCODE_ENDIF:
379 case BRW_OPCODE_BREAK:
380 case BRW_OPCODE_CONTINUE:
381 return true;
382 default:
383 return false;
384 }
385 }
386
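/**
 * Returns true for send-like opcodes whose message payload lives in a GRF
 * rather than being built up in MRFs.
 */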
387 bool
388 fs_inst::is_send_from_grf()
389 {
390 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
391 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
392 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
393 src[1].file == GRF));
394 }
395
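/**
 * Returns whether an instruction's sources may carry negate/abs modifiers:
 * gen6 math ignores source modifiers, and sends whose payload comes straight
 * from a GRF need the payload read as-is.
 */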
396 bool
397 fs_visitor::can_do_source_mods(fs_inst *inst)
398 {
399 if (intel->gen == 6 && inst->is_math())
400 return false;
401
402 if (inst->is_send_from_grf())
403 return false;
404
405 return true;
406 }
407
408 void
409 fs_reg::init()
410 {
411 memset(this, 0, sizeof(*this));
412 this->smear = -1;
413 }
414
415 /** Generic unset register constructor. */
416 fs_reg::fs_reg()
417 {
418 init();
419 this->file = BAD_FILE;
420 }
421
422 /** Immediate value constructor. */
423 fs_reg::fs_reg(float f)
424 {
425 init();
426 this->file = IMM;
427 this->type = BRW_REGISTER_TYPE_F;
428 this->imm.f = f;
429 }
430
431 /** Immediate value constructor. */
432 fs_reg::fs_reg(int32_t i)
433 {
434 init();
435 this->file = IMM;
436 this->type = BRW_REGISTER_TYPE_D;
437 this->imm.i = i;
438 }
439
440 /** Immediate value constructor. */
441 fs_reg::fs_reg(uint32_t u)
442 {
443 init();
444 this->file = IMM;
445 this->type = BRW_REGISTER_TYPE_UD;
446 this->imm.u = u;
447 }
448
449 /** Fixed brw_reg Immediate value constructor. */
450 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
451 {
452 init();
453 this->file = FIXED_HW_REG;
454 this->fixed_hw_reg = fixed_hw_reg;
455 this->type = fixed_hw_reg.type;
456 }
457
458 bool
459 fs_reg::equals(const fs_reg &r) const
460 {
461 return (file == r.file &&
462 reg == r.reg &&
463 reg_offset == r.reg_offset &&
464 type == r.type &&
465 negate == r.negate &&
466 abs == r.abs &&
467 !reladdr && !r.reladdr &&
468 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
469 sizeof(fixed_hw_reg)) == 0 &&
470 smear == r.smear &&
471 imm.u == r.imm.u);
472 }
473
474 bool
475 fs_reg::is_zero() const
476 {
477 if (file != IMM)
478 return false;
479
480 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
481 }
482
483 bool
484 fs_reg::is_one() const
485 {
486 if (file != IMM)
487 return false;
488
489 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
490 }
491
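/**
 * Returns how many scalar slots a variable of the given GLSL type occupies
 * in the virtual GRF file (samplers take none, since they're baked in at
 * link time).
 */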
492 int
493 fs_visitor::type_size(const struct glsl_type *type)
494 {
495 unsigned int size, i;
496
497 switch (type->base_type) {
498 case GLSL_TYPE_UINT:
499 case GLSL_TYPE_INT:
500 case GLSL_TYPE_FLOAT:
501 case GLSL_TYPE_BOOL:
502 return type->components();
503 case GLSL_TYPE_ARRAY:
504 return type_size(type->fields.array) * type->length;
505 case GLSL_TYPE_STRUCT:
506 size = 0;
507 for (i = 0; i < type->length; i++) {
508 size += type_size(type->fields.structure[i].type);
509 }
510 return size;
511 case GLSL_TYPE_SAMPLER:
512 /* Samplers take up no register space, since they're baked in at
513 * link time.
514 */
515 return 0;
516 case GLSL_TYPE_VOID:
517 case GLSL_TYPE_ERROR:
518 case GLSL_TYPE_INTERFACE:
519 assert(!"not reached");
520 break;
521 }
522
523 return 0;
524 }
525
526 fs_reg
527 fs_visitor::get_timestamp()
528 {
529 assert(intel->gen >= 7);
530
531 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
532 BRW_ARF_TIMESTAMP,
533 0),
534 BRW_REGISTER_TYPE_UD));
535
536 fs_reg dst = fs_reg(this, glsl_type::uint_type);
537
538 fs_inst *mov = emit(MOV(dst, ts));
539 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
540 * even if it's not enabled in the dispatch.
541 */
542 mov->force_writemask_all = true;
543 mov->force_uncompressed = true;
544
545 /* The caller wants the low 32 bits of the timestamp. Since it's running
546 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
547 * which is plenty of time for our purposes. It is identical across the
548 * EUs, but since it's tracking GPU core speed it will increment at a
549 * varying rate as render P-states change.
550 *
551 * The caller could also check if render P-states have changed (or anything
552 * else that might disrupt timing) by setting smear to 2 and checking if
553 * that field is != 0.
554 */
555 dst.smear = 0;
556
557 return dst;
558 }
559
560 void
561 fs_visitor::emit_shader_time_begin()
562 {
563 current_annotation = "shader time start";
564 shader_start_time = get_timestamp();
565 }
566
567 void
568 fs_visitor::emit_shader_time_end()
569 {
570 current_annotation = "shader time end";
571
572 enum shader_time_shader_type type, written_type, reset_type;
573 if (dispatch_width == 8) {
574 type = ST_FS8;
575 written_type = ST_FS8_WRITTEN;
576 reset_type = ST_FS8_RESET;
577 } else {
578 assert(dispatch_width == 16);
579 type = ST_FS16;
580 written_type = ST_FS16_WRITTEN;
581 reset_type = ST_FS16_RESET;
582 }
583
584 fs_reg shader_end_time = get_timestamp();
585
586 /* Check that there weren't any timestamp reset events (assuming these
587 * were the only two timestamp reads that happened).
588 */
589 fs_reg reset = shader_end_time;
590 reset.smear = 2;
591 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
592 test->conditional_mod = BRW_CONDITIONAL_Z;
593 emit(IF(BRW_PREDICATE_NORMAL));
594
595 push_force_uncompressed();
596 fs_reg start = shader_start_time;
597 start.negate = true;
598 fs_reg diff = fs_reg(this, glsl_type::uint_type);
599 emit(ADD(diff, start, shader_end_time));
600
601 /* If there were no instructions between the two timestamp gets, the diff
602 * is 2 cycles. Remove that overhead, so I can forget about that when
603 * trying to determine the time taken for single instructions.
604 */
605 emit(ADD(diff, diff, fs_reg(-2u)));
606
607 emit_shader_time_write(type, diff);
608 emit_shader_time_write(written_type, fs_reg(1u));
609 emit(BRW_OPCODE_ELSE);
610 emit_shader_time_write(reset_type, fs_reg(1u));
611 emit(BRW_OPCODE_ENDIF);
612
613 pop_force_uncompressed();
614 }
615
616 void
617 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
618 fs_reg value)
619 {
620 int shader_time_index =
621 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
622 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
623
624 fs_reg payload;
625 if (dispatch_width == 8)
626 payload = fs_reg(this, glsl_type::uvec2_type);
627 else
628 payload = fs_reg(this, glsl_type::uint_type);
629
630 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
631 fs_reg(), payload, offset, value));
632 }
633
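/**
 * Marks the compile as failed and records the message; only the first
 * failure is kept, and it is printed when DEBUG_WM is set.
 */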
634 void
635 fs_visitor::fail(const char *format, ...)
636 {
637 va_list va;
638 char *msg;
639
640 if (failed)
641 return;
642
643 failed = true;
644
645 va_start(va, format);
646 msg = ralloc_vasprintf(mem_ctx, format, va);
647 va_end(va);
648 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
649
650 this->fail_msg = msg;
651
652 if (INTEL_DEBUG & DEBUG_WM) {
653 fprintf(stderr, "%s", msg);
654 }
655 }
656
657 fs_inst *
658 fs_visitor::emit(enum opcode opcode)
659 {
660 return emit(fs_inst(opcode));
661 }
662
663 fs_inst *
664 fs_visitor::emit(enum opcode opcode, fs_reg dst)
665 {
666 return emit(fs_inst(opcode, dst));
667 }
668
669 fs_inst *
670 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
671 {
672 return emit(fs_inst(opcode, dst, src0));
673 }
674
675 fs_inst *
676 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
677 {
678 return emit(fs_inst(opcode, dst, src0, src1));
679 }
680
681 fs_inst *
682 fs_visitor::emit(enum opcode opcode, fs_reg dst,
683 fs_reg src0, fs_reg src1, fs_reg src2)
684 {
685 return emit(fs_inst(opcode, dst, src0, src1, src2));
686 }
687
688 void
689 fs_visitor::push_force_uncompressed()
690 {
691 force_uncompressed_stack++;
692 }
693
694 void
695 fs_visitor::pop_force_uncompressed()
696 {
697 force_uncompressed_stack--;
698 assert(force_uncompressed_stack >= 0);
699 }
700
701 void
702 fs_visitor::push_force_sechalf()
703 {
704 force_sechalf_stack++;
705 }
706
707 void
708 fs_visitor::pop_force_sechalf()
709 {
710 force_sechalf_stack--;
711 assert(force_sechalf_stack >= 0);
712 }
713
714 /**
715 * Returns true if the instruction has a flag that means it won't
716 * update an entire destination register.
717 *
718 * For example, dead code elimination and live variable analysis want to know
719 * when a write to a variable screens off any preceding values that were in
720 * it.
721 */
722 bool
723 fs_inst::is_partial_write()
724 {
725 return (this->predicate ||
726 this->force_uncompressed ||
727 this->force_sechalf);
728 }
729
730 /**
731 * Returns how many MRFs an FS opcode will write over.
732 *
733 * Note that this is not the 0 or 1 implied writes in an actual gen
734 * instruction -- the FS opcodes often generate MOVs in addition.
735 */
736 int
737 fs_visitor::implied_mrf_writes(fs_inst *inst)
738 {
739 if (inst->mlen == 0)
740 return 0;
741
742 switch (inst->opcode) {
743 case SHADER_OPCODE_RCP:
744 case SHADER_OPCODE_RSQ:
745 case SHADER_OPCODE_SQRT:
746 case SHADER_OPCODE_EXP2:
747 case SHADER_OPCODE_LOG2:
748 case SHADER_OPCODE_SIN:
749 case SHADER_OPCODE_COS:
750 return 1 * dispatch_width / 8;
751 case SHADER_OPCODE_POW:
752 case SHADER_OPCODE_INT_QUOTIENT:
753 case SHADER_OPCODE_INT_REMAINDER:
754 return 2 * dispatch_width / 8;
755 case SHADER_OPCODE_TEX:
756 case FS_OPCODE_TXB:
757 case SHADER_OPCODE_TXD:
758 case SHADER_OPCODE_TXF:
759 case SHADER_OPCODE_TXF_MS:
760 case SHADER_OPCODE_TXL:
761 case SHADER_OPCODE_TXS:
762 case SHADER_OPCODE_LOD:
763 return 1;
764 case FS_OPCODE_FB_WRITE:
765 return 2;
766 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
767 case FS_OPCODE_UNSPILL:
768 return 1;
769 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
770 return inst->mlen;
771 case FS_OPCODE_SPILL:
772 return 2;
773 default:
774 assert(!"not reached");
775 return inst->mlen;
776 }
777 }
778
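/**
 * Allocates a new virtual GRF of \p size registers and returns its index,
 * growing the virtual_grf_sizes[] array as needed.
 */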
779 int
780 fs_visitor::virtual_grf_alloc(int size)
781 {
782 if (virtual_grf_array_size <= virtual_grf_count) {
783 if (virtual_grf_array_size == 0)
784 virtual_grf_array_size = 16;
785 else
786 virtual_grf_array_size *= 2;
787 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
788 virtual_grf_array_size);
789 }
790 virtual_grf_sizes[virtual_grf_count] = size;
791 return virtual_grf_count++;
792 }
793
794 /** Fixed HW reg constructor. */
795 fs_reg::fs_reg(enum register_file file, int reg)
796 {
797 init();
798 this->file = file;
799 this->reg = reg;
800 this->type = BRW_REGISTER_TYPE_F;
801 }
802
803 /** Fixed HW reg constructor. */
804 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
805 {
806 init();
807 this->file = file;
808 this->reg = reg;
809 this->type = type;
810 }
811
812 /** Automatic reg constructor. */
813 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
814 {
815 init();
816
817 this->file = GRF;
818 this->reg = v->virtual_grf_alloc(v->type_size(type));
819 this->reg_offset = 0;
820 this->type = brw_type_for_base_type(type);
821 }
822
823 fs_reg *
824 fs_visitor::variable_storage(ir_variable *var)
825 {
826 return (fs_reg *)hash_table_find(this->variable_ht, var);
827 }
828
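/* hash_table_call_foreach() callback: copies each UNIFORM-file variable
 * mapping from the source visitor's hash table into the destination one.
 */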
829 void
830 import_uniforms_callback(const void *key,
831 void *data,
832 void *closure)
833 {
834 struct hash_table *dst_ht = (struct hash_table *)closure;
835 const fs_reg *reg = (const fs_reg *)data;
836
837 if (reg->file != UNIFORM)
838 return;
839
840 hash_table_insert(dst_ht, data, key);
841 }
842
843 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
844  * This brings in those uniform definitions.
845 */
846 void
847 fs_visitor::import_uniforms(fs_visitor *v)
848 {
849 hash_table_call_foreach(v->variable_ht,
850 import_uniforms_callback,
851 variable_ht);
852 this->params_remap = v->params_remap;
853 }
854
855 /* Our support for uniforms is piggy-backed on the struct
856 * gl_fragment_program, because that's where the values actually
857 * get stored, rather than in some global gl_shader_program uniform
858 * store.
859 */
860 void
861 fs_visitor::setup_uniform_values(ir_variable *ir)
862 {
863 int namelen = strlen(ir->name);
864
865 /* The data for our (non-builtin) uniforms is stored in a series of
866 * gl_uniform_driver_storage structs for each subcomponent that
867 * glGetUniformLocation() could name. We know it's been set up in the same
868 * order we'd walk the type, so walk the list of storage and find anything
869 * with our name, or the prefix of a component that starts with our name.
870 */
871 unsigned params_before = c->prog_data.nr_params;
872 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
873 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
874
875 if (strncmp(ir->name, storage->name, namelen) != 0 ||
876 (storage->name[namelen] != 0 &&
877 storage->name[namelen] != '.' &&
878 storage->name[namelen] != '[')) {
879 continue;
880 }
881
882 unsigned slots = storage->type->component_slots();
883 if (storage->array_elements)
884 slots *= storage->array_elements;
885
886 for (unsigned i = 0; i < slots; i++) {
887 c->prog_data.param[c->prog_data.nr_params++] =
888 &storage->storage[i].f;
889 }
890 }
891
892 /* Make sure we actually initialized the right amount of stuff here. */
893 assert(params_before + ir->type->component_slots() ==
894 c->prog_data.nr_params);
895 }
896
897
898 /* Our support for builtin uniforms is even scarier than non-builtin.
899 * It sits on top of the PROG_STATE_VAR parameters that are
900 * automatically updated from GL context state.
901 */
902 void
903 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
904 {
905 const ir_state_slot *const slots = ir->state_slots;
906 assert(ir->state_slots != NULL);
907
908 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
909 /* This state reference has already been setup by ir_to_mesa, but we'll
910 * get the same index back here.
911 */
912 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
913 (gl_state_index *)slots[i].tokens);
914
915 /* Add each of the unique swizzles of the element as a parameter.
916 * This'll end up matching the expected layout of the
917 * array/matrix/structure we're trying to fill in.
918 */
919 int last_swiz = -1;
920 for (unsigned int j = 0; j < 4; j++) {
921 int swiz = GET_SWZ(slots[i].swizzle, j);
922 if (swiz == last_swiz)
923 break;
924 last_swiz = swiz;
925
926 c->prog_data.param[c->prog_data.nr_params++] =
927 &fp->Base.Parameters->ParameterValues[index][swiz].f;
928 }
929 }
930 }
931
932 fs_reg *
933 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
934 {
935 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
936 fs_reg wpos = *reg;
937 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
938
939 /* gl_FragCoord.x */
940 if (ir->pixel_center_integer) {
941 emit(MOV(wpos, this->pixel_x));
942 } else {
943 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
944 }
945 wpos.reg_offset++;
946
947 /* gl_FragCoord.y */
948 if (!flip && ir->pixel_center_integer) {
949 emit(MOV(wpos, this->pixel_y));
950 } else {
951 fs_reg pixel_y = this->pixel_y;
952 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
953
954 if (flip) {
955 pixel_y.negate = true;
956 offset += c->key.drawable_height - 1.0;
957 }
958
959 emit(ADD(wpos, pixel_y, fs_reg(offset)));
960 }
961 wpos.reg_offset++;
962
963 /* gl_FragCoord.z */
964 if (intel->gen >= 6) {
965 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
966 } else {
967 emit(FS_OPCODE_LINTERP, wpos,
968 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
969 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
970 interp_reg(VARYING_SLOT_POS, 2));
971 }
972 wpos.reg_offset++;
973
974 /* gl_FragCoord.w: Already set up in emit_interpolation */
975 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
976
977 return reg;
978 }
979
980 fs_inst *
981 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
982 glsl_interp_qualifier interpolation_mode,
983 bool is_centroid)
984 {
985 brw_wm_barycentric_interp_mode barycoord_mode;
986 if (is_centroid) {
987 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
988 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
989 else
990 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
991 } else {
992 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
993 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
994 else
995 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
996 }
997 return emit(FS_OPCODE_LINTERP, attr,
998 this->delta_x[barycoord_mode],
999 this->delta_y[barycoord_mode], interp);
1000 }
1001
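/**
 * Emits interpolation (or constant moves, for flat shading) for a varying
 * input, one component at a time, walking array elements and matrix columns.
 */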
1002 fs_reg *
1003 fs_visitor::emit_general_interpolation(ir_variable *ir)
1004 {
1005 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1006 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1007 fs_reg attr = *reg;
1008
1009 unsigned int array_elements;
1010 const glsl_type *type;
1011
1012 if (ir->type->is_array()) {
1013 array_elements = ir->type->length;
1014 if (array_elements == 0) {
1015 fail("dereferenced array '%s' has length 0\n", ir->name);
1016 }
1017 type = ir->type->fields.array;
1018 } else {
1019 array_elements = 1;
1020 type = ir->type;
1021 }
1022
1023 glsl_interp_qualifier interpolation_mode =
1024 ir->determine_interpolation_mode(c->key.flat_shade);
1025
1026 int location = ir->location;
1027 for (unsigned int i = 0; i < array_elements; i++) {
1028 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1029 if (urb_setup[location] == -1) {
1030 /* If there's no incoming setup data for this slot, don't
1031 * emit interpolation for it.
1032 */
1033 attr.reg_offset += type->vector_elements;
1034 location++;
1035 continue;
1036 }
1037
1038 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1039 /* Constant interpolation (flat shading) case. The SF has
1040 * handed us defined values in only the constant offset
1041 * field of the setup reg.
1042 */
1043 for (unsigned int k = 0; k < type->vector_elements; k++) {
1044 struct brw_reg interp = interp_reg(location, k);
1045 interp = suboffset(interp, 3);
1046 interp.type = reg->type;
1047 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1048 attr.reg_offset++;
1049 }
1050 } else {
1051 /* Smooth/noperspective interpolation case. */
1052 for (unsigned int k = 0; k < type->vector_elements; k++) {
1053 /* FINISHME: At some point we probably want to push
1054 * this farther by giving similar treatment to the
1055 * other potentially constant components of the
1056 * attribute, as well as making brw_vs_constval.c
1057 * handle varyings other than gl_TexCoord.
1058 */
1059 struct brw_reg interp = interp_reg(location, k);
1060 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1061 ir->centroid);
1062 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1063 /* Get the pixel/sample mask into f0 so that we know
1064 * which pixels are lit. Then, for each channel that is
1065 * unlit, replace the centroid data with non-centroid
1066 * data.
1067 */
1068 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1069 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1070 interpolation_mode, false);
1071 inst->predicate = BRW_PREDICATE_NORMAL;
1072 inst->predicate_inverse = true;
1073 }
1074 if (intel->gen < 6) {
1075 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1076 }
1077 attr.reg_offset++;
1078 }
1079
1080 }
1081 location++;
1082 }
1083 }
1084
1085 return reg;
1086 }
1087
1088 fs_reg *
1089 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1090 {
1091 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1092
1093 /* The frontfacing comes in as a bit in the thread payload. */
1094 if (intel->gen >= 6) {
1095 emit(BRW_OPCODE_ASR, *reg,
1096 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1097 fs_reg(15));
1098 emit(BRW_OPCODE_NOT, *reg, *reg);
1099 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1100 } else {
1101 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1102 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1103 * us front face
1104 */
1105 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1106 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1107 }
1108
1109 return reg;
1110 }
1111
1112 fs_reg
1113 fs_visitor::fix_math_operand(fs_reg src)
1114 {
1115 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1116 * might be able to do better by doing execsize = 1 math and then
1117 * expanding that result out, but we would need to be careful with
1118 * masking.
1119 *
1120 * The hardware ignores source modifiers (negate and abs) on math
1121 * instructions, so we also move to a temp to set those up.
1122 */
1123 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1124 !src.abs && !src.negate)
1125 return src;
1126
1127 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1128 * operands to math
1129 */
1130 if (intel->gen >= 7 && src.file != IMM)
1131 return src;
1132
1133 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1134 expanded.type = src.type;
1135 emit(BRW_OPCODE_MOV, expanded, src);
1136 return expanded;
1137 }
1138
1139 fs_inst *
1140 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1141 {
1142 switch (opcode) {
1143 case SHADER_OPCODE_RCP:
1144 case SHADER_OPCODE_RSQ:
1145 case SHADER_OPCODE_SQRT:
1146 case SHADER_OPCODE_EXP2:
1147 case SHADER_OPCODE_LOG2:
1148 case SHADER_OPCODE_SIN:
1149 case SHADER_OPCODE_COS:
1150 break;
1151 default:
1152 assert(!"not reached: bad math opcode");
1153 return NULL;
1154 }
1155
1156 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1157 * might be able to do better by doing execsize = 1 math and then
1158 * expanding that result out, but we would need to be careful with
1159 * masking.
1160 *
1161 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1162 * instructions, so we also move to a temp to set those up.
1163 */
1164 if (intel->gen >= 6)
1165 src = fix_math_operand(src);
1166
1167 fs_inst *inst = emit(opcode, dst, src);
1168
1169 if (intel->gen < 6) {
1170 inst->base_mrf = 2;
1171 inst->mlen = dispatch_width / 8;
1172 }
1173
1174 return inst;
1175 }
1176
1177 fs_inst *
1178 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1179 {
1180 int base_mrf = 2;
1181 fs_inst *inst;
1182
1183 switch (opcode) {
1184 case SHADER_OPCODE_INT_QUOTIENT:
1185 case SHADER_OPCODE_INT_REMAINDER:
1186 if (intel->gen >= 7 && dispatch_width == 16)
1187 fail("16-wide INTDIV unsupported\n");
1188 break;
1189 case SHADER_OPCODE_POW:
1190 break;
1191 default:
1192 assert(!"not reached: unsupported binary math opcode.");
1193 return NULL;
1194 }
1195
1196 if (intel->gen >= 6) {
1197 src0 = fix_math_operand(src0);
1198 src1 = fix_math_operand(src1);
1199
1200 inst = emit(opcode, dst, src0, src1);
1201 } else {
1202 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1203 * "Message Payload":
1204 *
1205 * "Operand0[7]. For the INT DIV functions, this operand is the
1206 * denominator."
1207 * ...
1208 * "Operand1[7]. For the INT DIV functions, this operand is the
1209 * numerator."
1210 */
1211 bool is_int_div = opcode != SHADER_OPCODE_POW;
1212 fs_reg &op0 = is_int_div ? src1 : src0;
1213 fs_reg &op1 = is_int_div ? src0 : src1;
1214
1215 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1216 inst = emit(opcode, dst, op0, reg_null_f);
1217
1218 inst->base_mrf = base_mrf;
1219 inst->mlen = 2 * dispatch_width / 8;
1220 }
1221 return inst;
1222 }
1223
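/**
 * Assigns the push-constant (CURBE) part of the payload: computes how many
 * registers the uniforms occupy and rewrites UNIFORM-file sources as fixed
 * HW registers within that payload.
 */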
1224 void
1225 fs_visitor::assign_curb_setup()
1226 {
1227 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1228 if (dispatch_width == 8) {
1229 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1230 } else {
1231 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1232 }
1233
1234 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1235 foreach_list(node, &this->instructions) {
1236 fs_inst *inst = (fs_inst *)node;
1237
1238 for (unsigned int i = 0; i < 3; i++) {
1239 if (inst->src[i].file == UNIFORM) {
1240 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1241 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1242 constant_nr / 8,
1243 constant_nr % 8);
1244
1245 inst->src[i].file = FIXED_HW_REG;
1246 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1247 }
1248 }
1249 }
1250 }
1251
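/**
 * Decides which incoming setup slot each fragment shader input is read from,
 * filling urb_setup[] and computing the URB read length.
 */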
1252 void
1253 fs_visitor::calculate_urb_setup()
1254 {
1255 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1256 urb_setup[i] = -1;
1257 }
1258
1259 int urb_next = 0;
1260 /* Figure out where each of the incoming setup attributes lands. */
1261 if (intel->gen >= 6) {
1262 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1263 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1264 urb_setup[i] = urb_next++;
1265 }
1266 }
1267 } else {
1268 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1269 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1270 /* Point size is packed into the header, not as a general attribute */
1271 if (i == VARYING_SLOT_PSIZ)
1272 continue;
1273
1274 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1275 /* The back color slot is skipped when the front color is
1276 * also written to. In addition, some slots can be
1277 * written in the vertex shader and not read in the
1278 * fragment shader. So the register number must always be
1279 * incremented, mapped or not.
1280 */
1281 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1282 urb_setup[i] = urb_next;
1283 urb_next++;
1284 }
1285 }
1286
1287 /*
1288        * It's an FS-only attribute, and we did the interpolation for this attribute
1289        * in the SF thread. So, count it here, too.
1290 *
1291 * See compile_sf_prog() for more info.
1292 */
1293 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1294 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1295 }
1296
1297 /* Each attribute is 4 setup channels, each of which is half a reg. */
1298 c->prog_data.urb_read_length = urb_next * 2;
1299 }
1300
1301 void
1302 fs_visitor::assign_urb_setup()
1303 {
1304 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1305
1306    /* Offset all the urb_setup[] indices by the actual position of the
1307 * setup regs, now that the location of the constants has been chosen.
1308 */
1309 foreach_list(node, &this->instructions) {
1310 fs_inst *inst = (fs_inst *)node;
1311
1312 if (inst->opcode == FS_OPCODE_LINTERP) {
1313 assert(inst->src[2].file == FIXED_HW_REG);
1314 inst->src[2].fixed_hw_reg.nr += urb_start;
1315 }
1316
1317 if (inst->opcode == FS_OPCODE_CINTERP) {
1318 assert(inst->src[0].file == FIXED_HW_REG);
1319 inst->src[0].fixed_hw_reg.nr += urb_start;
1320 }
1321 }
1322
1323 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1324 }
1325
1326 /**
1327 * Split large virtual GRFs into separate components if we can.
1328 *
1329 * This is mostly duplicated with what brw_fs_vector_splitting does,
1330 * but that's really conservative because it's afraid of doing
1331 * splitting that doesn't result in real progress after the rest of
1332 * the optimization phases, which would cause infinite looping in
1333 * optimization. We can do it once here, safely. This also has the
1334 * opportunity to split interpolated values, or maybe even uniforms,
1335 * which we don't have at the IR level.
1336 *
1337 * We want to split, because virtual GRFs are what we register
1338 * allocate and spill (due to contiguousness requirements for some
1339 * instructions), and they're what we naturally generate in the
1340 * codegen process, but most virtual GRFs don't actually need to be
1341 * contiguous sets of GRFs. If we split, we'll end up with reduced
1342 * live intervals and better dead code elimination and coalescing.
1343 */
1344 void
1345 fs_visitor::split_virtual_grfs()
1346 {
1347 int num_vars = this->virtual_grf_count;
1348 bool split_grf[num_vars];
1349 int new_virtual_grf[num_vars];
1350
1351    /* Try to split anything larger than one register. */
1352 for (int i = 0; i < num_vars; i++) {
1353 if (this->virtual_grf_sizes[i] != 1)
1354 split_grf[i] = true;
1355 else
1356 split_grf[i] = false;
1357 }
1358
1359 if (brw->has_pln &&
1360 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1361 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1362 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1363 * Gen6, that was the only supported interpolation mode, and since Gen6,
1364 * delta_x and delta_y are in fixed hardware registers.
1365 */
1366 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1367 false;
1368 }
1369
1370 foreach_list(node, &this->instructions) {
1371 fs_inst *inst = (fs_inst *)node;
1372
1373 /* If there's a SEND message that requires contiguous destination
1374 * registers, no splitting is allowed.
1375 */
1376 if (inst->regs_written > 1) {
1377 split_grf[inst->dst.reg] = false;
1378 }
1379
1380 /* If we're sending from a GRF, don't split it, on the assumption that
1381 * the send is reading the whole thing.
1382 */
1383 if (inst->is_send_from_grf()) {
1384 split_grf[inst->src[0].reg] = false;
1385 }
1386 }
1387
1388 /* Allocate new space for split regs. Note that the virtual
1389 * numbers will be contiguous.
1390 */
1391 for (int i = 0; i < num_vars; i++) {
1392 if (split_grf[i]) {
1393 new_virtual_grf[i] = virtual_grf_alloc(1);
1394 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1395 int reg = virtual_grf_alloc(1);
1396 assert(reg == new_virtual_grf[i] + j - 1);
1397 (void) reg;
1398 }
1399 this->virtual_grf_sizes[i] = 1;
1400 }
1401 }
1402
1403 foreach_list(node, &this->instructions) {
1404 fs_inst *inst = (fs_inst *)node;
1405
1406 if (inst->dst.file == GRF &&
1407 split_grf[inst->dst.reg] &&
1408 inst->dst.reg_offset != 0) {
1409 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1410 inst->dst.reg_offset - 1);
1411 inst->dst.reg_offset = 0;
1412 }
1413 for (int i = 0; i < 3; i++) {
1414 if (inst->src[i].file == GRF &&
1415 split_grf[inst->src[i].reg] &&
1416 inst->src[i].reg_offset != 0) {
1417 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1418 inst->src[i].reg_offset - 1);
1419 inst->src[i].reg_offset = 0;
1420 }
1421 }
1422 }
1423 this->live_intervals_valid = false;
1424 }
1425
1426 /**
1427 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1428 *
1429 * During code generation, we create tons of temporary variables, many of
1430 * which get immediately killed and are never used again. Yet, in later
1431 * optimization and analysis passes, such as compute_live_intervals, we need
1432 * to loop over all the virtual GRFs. Compacting them can save a lot of
1433 * overhead.
1434 */
1435 void
1436 fs_visitor::compact_virtual_grfs()
1437 {
1438 /* Mark which virtual GRFs are used, and count how many. */
1439 int remap_table[this->virtual_grf_count];
1440 memset(remap_table, -1, sizeof(remap_table));
1441
1442 foreach_list(node, &this->instructions) {
1443 const fs_inst *inst = (const fs_inst *) node;
1444
1445 if (inst->dst.file == GRF)
1446 remap_table[inst->dst.reg] = 0;
1447
1448 for (int i = 0; i < 3; i++) {
1449 if (inst->src[i].file == GRF)
1450 remap_table[inst->src[i].reg] = 0;
1451 }
1452 }
1453
1454 /* In addition to registers used in instructions, fs_visitor keeps
1455 * direct references to certain special values which must be patched:
1456 */
1457 fs_reg *special[] = {
1458 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1459 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1460 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1461 &delta_x[0], &delta_x[1], &delta_x[2],
1462 &delta_x[3], &delta_x[4], &delta_x[5],
1463 &delta_y[0], &delta_y[1], &delta_y[2],
1464 &delta_y[3], &delta_y[4], &delta_y[5],
1465 };
1466 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1467 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1468
1469 /* Treat all special values as used, to be conservative */
1470 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1471 if (special[i]->file == GRF)
1472 remap_table[special[i]->reg] = 0;
1473 }
1474
1475 /* Compact the GRF arrays. */
1476 int new_index = 0;
1477 for (int i = 0; i < this->virtual_grf_count; i++) {
1478 if (remap_table[i] != -1) {
1479 remap_table[i] = new_index;
1480 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1481 if (live_intervals_valid) {
1482 virtual_grf_use[new_index] = virtual_grf_use[i];
1483 virtual_grf_def[new_index] = virtual_grf_def[i];
1484 }
1485 ++new_index;
1486 }
1487 }
1488
1489 this->virtual_grf_count = new_index;
1490
1491 /* Patch all the instructions to use the newly renumbered registers */
1492 foreach_list(node, &this->instructions) {
1493 fs_inst *inst = (fs_inst *) node;
1494
1495 if (inst->dst.file == GRF)
1496 inst->dst.reg = remap_table[inst->dst.reg];
1497
1498 for (int i = 0; i < 3; i++) {
1499 if (inst->src[i].file == GRF)
1500 inst->src[i].reg = remap_table[inst->src[i].reg];
1501 }
1502 }
1503
1504 /* Patch all the references to special values */
1505 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1506 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1507 special[i]->reg = remap_table[special[i]->reg];
1508 }
1509 }
1510
1511 bool
1512 fs_visitor::remove_dead_constants()
1513 {
1514 if (dispatch_width == 8) {
1515 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1516
1517 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1518 this->params_remap[i] = -1;
1519
1520 /* Find which params are still in use. */
1521 foreach_list(node, &this->instructions) {
1522 fs_inst *inst = (fs_inst *)node;
1523
1524 for (int i = 0; i < 3; i++) {
1525 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1526
1527 if (inst->src[i].file != UNIFORM)
1528 continue;
1529
1530 assert(constant_nr < (int)c->prog_data.nr_params);
1531
1532 /* For now, set this to non-negative. We'll give it the
1533 * actual new number in a moment, in order to keep the
1534 * register numbers nicely ordered.
1535 */
1536 this->params_remap[constant_nr] = 0;
1537 }
1538 }
1539
1540 /* Figure out what the new numbers for the params will be. At some
1541 * point when we're doing uniform array access, we're going to want
1542 * to keep the distinction between .reg and .reg_offset, but for
1543 * now we don't care.
1544 */
1545 unsigned int new_nr_params = 0;
1546 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1547 if (this->params_remap[i] != -1) {
1548 this->params_remap[i] = new_nr_params++;
1549 }
1550 }
1551
1552 /* Update the list of params to be uploaded to match our new numbering. */
1553 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1554 int remapped = this->params_remap[i];
1555
1556 if (remapped == -1)
1557 continue;
1558
1559 c->prog_data.param[remapped] = c->prog_data.param[i];
1560 }
1561
1562 c->prog_data.nr_params = new_nr_params;
1563 } else {
1564 /* This should have been generated in the 8-wide pass already. */
1565 assert(this->params_remap);
1566 }
1567
1568 /* Now do the renumbering of the shader to remove unused params. */
1569 foreach_list(node, &this->instructions) {
1570 fs_inst *inst = (fs_inst *)node;
1571
1572 for (int i = 0; i < 3; i++) {
1573 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1574
1575 if (inst->src[i].file != UNIFORM)
1576 continue;
1577
1578 assert(this->params_remap[constant_nr] != -1);
1579 inst->src[i].reg = this->params_remap[constant_nr];
1580 inst->src[i].reg_offset = 0;
1581 }
1582 }
1583
1584 return true;
1585 }
1586
1587 /*
1588 * Implements array access of uniforms by inserting a
1589 * PULL_CONSTANT_LOAD instruction.
1590 *
1591 * Unlike temporary GRF array access (where we don't support it due to
1592 * the difficulty of doing relative addressing on instruction
1593 * destinations), we could potentially do array access of uniforms
1594 * that were loaded in GRF space as push constants. In real-world
1595 * usage we've seen, though, the arrays being used are always larger
1596 * than we could load as push constants, so just always move all
1597 * uniform array access out to a pull constant buffer.
1598 */
1599 void
1600 fs_visitor::move_uniform_array_access_to_pull_constants()
1601 {
1602 int pull_constant_loc[c->prog_data.nr_params];
1603
1604 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1605 pull_constant_loc[i] = -1;
1606 }
1607
1608 /* Walk through and find array access of uniforms. Put a copy of that
1609 * uniform in the pull constant buffer.
1610 *
1611 * Note that we don't move constant-indexed accesses to arrays. No
1612 * testing has been done of the performance impact of this choice.
1613 */
1614 foreach_list_safe(node, &this->instructions) {
1615 fs_inst *inst = (fs_inst *)node;
1616
1617 for (int i = 0 ; i < 3; i++) {
1618 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1619 continue;
1620
1621 int uniform = inst->src[i].reg;
1622
1623 /* If this array isn't already present in the pull constant buffer,
1624 * add it.
1625 */
1626 if (pull_constant_loc[uniform] == -1) {
1627 const float **values = &c->prog_data.param[uniform];
1628
1629 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1630
1631 assert(param_size[uniform]);
1632
1633 for (int j = 0; j < param_size[uniform]; j++) {
1634 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1635 values[j];
1636 }
1637 }
1638
1639 /* Set up the annotation tracking for new generated instructions. */
1640 base_ir = inst->ir;
1641 current_annotation = inst->annotation;
1642
1643 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1644 fs_reg temp = fs_reg(this, glsl_type::float_type);
1645 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1646 surf_index,
1647 *inst->src[i].reladdr,
1648 pull_constant_loc[uniform] +
1649 inst->src[i].reg_offset);
1650 inst->insert_before(&list);
1651
1652 inst->src[i].file = temp.file;
1653 inst->src[i].reg = temp.reg;
1654 inst->src[i].reg_offset = temp.reg_offset;
1655 inst->src[i].reladdr = NULL;
1656 }
1657 }
1658 }
1659
1660 /**
1661 * Choose accesses from the UNIFORM file to demote to using the pull
1662 * constant buffer.
1663 *
1664 * We allow a fragment shader to have more than the specified minimum
1665 * maximum number of fragment shader uniform components (64). If
1666 * there are too many of these, they'd fill up all of register space.
1667 * So, this will push some of them out to the pull constant buffer and
1668 * update the program to load them.
1669 */
1670 void
1671 fs_visitor::setup_pull_constants()
1672 {
1673 /* Only allow 16 registers (128 uniform components) as push constants. */
1674 unsigned int max_uniform_components = 16 * 8;
1675 if (c->prog_data.nr_params <= max_uniform_components)
1676 return;
1677
1678 if (dispatch_width == 16) {
1679 fail("Pull constants not supported in 16-wide\n");
1680 return;
1681 }
1682
1683 /* Just demote the end of the list. We could probably do better
1684 * here, demoting things that are rarely used in the program first.
1685 */
1686 unsigned int pull_uniform_base = max_uniform_components;
1687
1688 int pull_constant_loc[c->prog_data.nr_params];
1689 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1690 if (i < pull_uniform_base) {
1691 pull_constant_loc[i] = -1;
1692 } else {
1693 pull_constant_loc[i] = -1;
1694 /* If our constant is already being uploaded for reladdr purposes,
1695 * reuse it.
1696 */
1697 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1698 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1699 pull_constant_loc[i] = j;
1700 break;
1701 }
1702 }
1703 if (pull_constant_loc[i] == -1) {
1704 int pull_index = c->prog_data.nr_pull_params++;
1705 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1706             pull_constant_loc[i] = pull_index;
1707 }
1708 }
1709 }
1710 c->prog_data.nr_params = pull_uniform_base;
1711
1712 foreach_list(node, &this->instructions) {
1713 fs_inst *inst = (fs_inst *)node;
1714
1715 for (int i = 0; i < 3; i++) {
1716 if (inst->src[i].file != UNIFORM)
1717 continue;
1718
1719 int pull_index = pull_constant_loc[inst->src[i].reg +
1720 inst->src[i].reg_offset];
1721 if (pull_index == -1)
1722 continue;
1723
1724 assert(!inst->src[i].reladdr);
1725
1726 fs_reg dst = fs_reg(this, glsl_type::float_type);
1727 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1728 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1729 fs_inst *pull =
1730 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1731 dst, index, offset);
1732 pull->ir = inst->ir;
1733 pull->annotation = inst->annotation;
1734
1735 inst->insert_before(pull);
1736
1737 inst->src[i].file = GRF;
1738 inst->src[i].reg = dst.reg;
1739 inst->src[i].reg_offset = 0;
1740 inst->src[i].smear = pull_index & 3;
1741 }
1742 }
1743 }
1744
1745 bool
1746 fs_visitor::opt_algebraic()
1747 {
1748 bool progress = false;
1749
1750 foreach_list(node, &this->instructions) {
1751 fs_inst *inst = (fs_inst *)node;
1752
1753 switch (inst->opcode) {
1754 case BRW_OPCODE_MUL:
1755 if (inst->src[1].file != IMM)
1756 continue;
1757
1758 /* a * 1.0 = a */
1759 if (inst->src[1].is_one()) {
1760 inst->opcode = BRW_OPCODE_MOV;
1761 inst->src[1] = reg_undef;
1762 progress = true;
1763 break;
1764 }
1765
1766 /* a * 0.0 = 0.0 */
1767 if (inst->src[1].is_zero()) {
1768 inst->opcode = BRW_OPCODE_MOV;
1769 inst->src[0] = inst->src[1];
1770 inst->src[1] = reg_undef;
1771 progress = true;
1772 break;
1773 }
1774
1775 break;
1776 case BRW_OPCODE_ADD:
1777 if (inst->src[1].file != IMM)
1778 continue;
1779
1780 /* a + 0.0 = a */
1781 if (inst->src[1].is_zero()) {
1782 inst->opcode = BRW_OPCODE_MOV;
1783 inst->src[1] = reg_undef;
1784 progress = true;
1785 break;
1786 }
1787 break;
1788 default:
1789 break;
1790 }
1791 }
1792
1793 return progress;
1794 }
1795
1796 /**
1797  * Must be called after calculate_live_intervals() to remove unused
1798  * writes to registers -- register allocation will fail otherwise
1799  * because something def'd but not used won't be considered to
1800 * interfere with other regs.
1801 */
1802 bool
1803 fs_visitor::dead_code_eliminate()
1804 {
1805 bool progress = false;
1806 int pc = 0;
1807
1808 calculate_live_intervals();
1809
1810 foreach_list_safe(node, &this->instructions) {
1811 fs_inst *inst = (fs_inst *)node;
1812
1813 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1814 inst->remove();
1815 progress = true;
1816 }
1817
1818 pc++;
1819 }
1820
1821 if (progress)
1822 live_intervals_valid = false;
1823
1824 return progress;
1825 }
1826
1827 struct dead_code_hash_key
1828 {
1829 int vgrf;
1830 int reg_offset;
1831 };
1832
1833 static bool
1834 dead_code_hash_compare(const void *a, const void *b)
1835 {
1836 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1837 }
1838
1839 static void
1840 clear_dead_code_hash(struct hash_table *ht)
1841 {
1842 struct hash_entry *entry;
1843
1844 hash_table_foreach(ht, entry) {
1845 _mesa_hash_table_remove(ht, entry);
1846 }
1847 }
1848
1849 static void
1850 insert_dead_code_hash(struct hash_table *ht,
1851 int vgrf, int reg_offset, fs_inst *inst)
1852 {
1853    /* We don't bother freeing keys: they're ralloc'd off the ht, so they'll be freed along with it. */
1854 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1855
1856 key->vgrf = vgrf;
1857 key->reg_offset = reg_offset;
1858
1859 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1860 }
1861
1862 static struct hash_entry *
1863 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1864 {
1865 struct dead_code_hash_key key;
1866
1867 key.vgrf = vgrf;
1868 key.reg_offset = reg_offset;
1869
1870 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1871 }
1872
1873 static void
1874 remove_dead_code_hash(struct hash_table *ht,
1875 int vgrf, int reg_offset)
1876 {
1877 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1878 if (!entry)
1879 return;
1880
1881 _mesa_hash_table_remove(ht, entry);
1882 }
1883
1884 /**
1885 * Walks basic blocks, removing any regs that are written but not read before
1886 * being redefined.
1887 *
1888 * The dead_code_eliminate() function implements a global dead code
1889  * elimination, but it only handles removing the last write to a register
1890 * if it's never read. This one can handle intermediate writes, but only
1891 * within a basic block.
1892 */
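/* For example (illustrative), within one basic block:
 *
 *    MOV vgrf4, a    <- removed by this pass: overwritten below before being read
 *    MOV vgrf4, b
 *    ADD vgrf5, vgrf4, c
 */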
1893 bool
1894 fs_visitor::dead_code_eliminate_local()
1895 {
1896 struct hash_table *ht;
1897 bool progress = false;
1898
1899 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1900
1901 foreach_list_safe(node, &this->instructions) {
1902 fs_inst *inst = (fs_inst *)node;
1903
1904       /* At a basic block boundary, empty the HT, since we don't track
1905        * dataflow across blocks.
1906 */
1907 if (inst->is_control_flow()) {
1908 clear_dead_code_hash(ht);
1909 continue;
1910 }
1911
1912       /* Remove from the HT any pending writes whose registers got read. */
1913 for (int i = 0; i < 3; i++) {
1914 fs_reg src = inst->src[i];
1915 if (src.file != GRF)
1916 continue;
1917
1918 int read = 1;
1919 if (inst->is_send_from_grf())
1920 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1921
1922 for (int reg_offset = src.reg_offset;
1923 reg_offset < src.reg_offset + read;
1924 reg_offset++) {
1925 remove_dead_code_hash(ht, src.reg, reg_offset);
1926 }
1927 }
1928
1929 /* Add any update of a GRF to the HT, removing a previous write if it
1930 * wasn't read.
1931 */
1932 if (inst->dst.file == GRF) {
1933 if (inst->regs_written > 1) {
1934 /* We don't know how to trim channels from an instruction's
1935 * writes, so we can't incrementally remove unread channels from
1936              * it.  Just remove whatever it overwrites from the table.
1937 */
1938 for (int i = 0; i < inst->regs_written; i++) {
1939 remove_dead_code_hash(ht,
1940 inst->dst.reg,
1941 inst->dst.reg_offset + i);
1942 }
1943 } else {
1944 struct hash_entry *entry =
1945 get_dead_code_hash_entry(ht, inst->dst.reg,
1946 inst->dst.reg_offset);
1947
1948 if (inst->is_partial_write()) {
1949 /* For a partial write, we can't remove any previous dead code
1950              * candidate, since we're just modifying its result, but we can
1951              * be dead code eliminated ourselves.
1952 */
1953 if (entry) {
1954 entry->data = inst;
1955 } else {
1956 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1957 inst);
1958 }
1959 } else {
1960 if (entry) {
1961 /* We're completely updating a channel, and there was a
1962 * previous write to the channel that wasn't read. Kill it!
1963 */
1964 fs_inst *inst = (fs_inst *)entry->data;
1965 inst->remove();
1966 progress = true;
1967 _mesa_hash_table_remove(ht, entry);
1968 }
1969
1970 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1971 inst);
1972 }
1973 }
1974 }
1975 }
1976
1977 _mesa_hash_table_destroy(ht, NULL);
1978
1979 if (progress)
1980 live_intervals_valid = false;
1981
1982 return progress;
1983 }
1984
1985 /**
1986 * Implements a second type of register coalescing: This one checks if
1987 * the two regs involved in a raw move don't interfere, in which case
1988  * they can both be stored in the same place and the MOV removed.
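 *
 * Illustrative example (vgrf numbers are arbitrary): given
 *
 *    mov vgrf2, vgrf1
 *
 * where vgrf1 and vgrf2 have non-overlapping live ranges, every write and
 * read of vgrf1 is redirected to vgrf2 and the MOV is deleted.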
1989 */
1990 bool
1991 fs_visitor::register_coalesce_2()
1992 {
1993 bool progress = false;
1994
1995 calculate_live_intervals();
1996
1997 foreach_list_safe(node, &this->instructions) {
1998 fs_inst *inst = (fs_inst *)node;
1999
2000 if (inst->opcode != BRW_OPCODE_MOV ||
2001 inst->predicate ||
2002 inst->saturate ||
2003 inst->src[0].file != GRF ||
2004 inst->src[0].negate ||
2005 inst->src[0].abs ||
2006 inst->src[0].smear != -1 ||
2007 inst->dst.file != GRF ||
2008 inst->dst.type != inst->src[0].type ||
2009 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2010 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2011 continue;
2012 }
2013
2014 int reg_from = inst->src[0].reg;
2015 assert(inst->src[0].reg_offset == 0);
2016 int reg_to = inst->dst.reg;
2017 int reg_to_offset = inst->dst.reg_offset;
2018
2019 foreach_list(node, &this->instructions) {
2020 fs_inst *scan_inst = (fs_inst *)node;
2021
2022 if (scan_inst->dst.file == GRF &&
2023 scan_inst->dst.reg == reg_from) {
2024 scan_inst->dst.reg = reg_to;
2025 scan_inst->dst.reg_offset = reg_to_offset;
2026 }
2027 for (int i = 0; i < 3; i++) {
2028 if (scan_inst->src[i].file == GRF &&
2029 scan_inst->src[i].reg == reg_from) {
2030 scan_inst->src[i].reg = reg_to;
2031 scan_inst->src[i].reg_offset = reg_to_offset;
2032 }
2033 }
2034 }
2035
2036 inst->remove();
2037
2038 /* We don't need to recalculate live intervals inside the loop despite
2039 * flagging live_intervals_valid because we only use live intervals for
2040 * the interferes test, and we must have had a situation where the
2041 * intervals were:
2042 *
2043        *  from      to
2044        *   ^
2045        *   |
2046        *   v
2047        *             ^
2048        *             |
2049        *             v
2050 *
2051 * Some register R that might get coalesced with one of these two could
2052 * only be referencing "to", otherwise "from"'s range would have been
2053 * longer. R's range could also only start at the end of "to" or later,
2054 * otherwise it will conflict with "to" when we try to coalesce "to"
2055        * into R anyway.
2056 */
2057 live_intervals_valid = false;
2058
2059 progress = true;
2060 continue;
2061 }
2062
2063 return progress;
2064 }
2065
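/**
 * Attempts to remove a raw MOV by making its later readers read the MOV's
 * source directly.
 *
 * Illustrative example (vgrf numbers are arbitrary):
 *
 *    mov vgrf2, vgrf1
 *    add vgrf3, vgrf2, vgrf4
 *
 * becomes
 *
 *    add vgrf3, vgrf1, vgrf4
 *
 * provided neither register is overwritten before the end of the program and
 * the MOV is not inside an if block or loop.
 */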
2066 bool
2067 fs_visitor::register_coalesce()
2068 {
2069 bool progress = false;
2070 int if_depth = 0;
2071 int loop_depth = 0;
2072
2073 foreach_list_safe(node, &this->instructions) {
2074 fs_inst *inst = (fs_inst *)node;
2075
2076       /* Make sure that we dominate the instructions we're going to
2077        * scan for interference; otherwise we won't have scanned enough
2078        * of the program to know whether anything interferes with our
2079        * coalescing.  We don't dominate the following instructions if
2080        * we're inside a loop or an if block.
2081        */
2082 switch (inst->opcode) {
2083 case BRW_OPCODE_DO:
2084 loop_depth++;
2085 break;
2086 case BRW_OPCODE_WHILE:
2087 loop_depth--;
2088 break;
2089 case BRW_OPCODE_IF:
2090 if_depth++;
2091 break;
2092 case BRW_OPCODE_ENDIF:
2093 if_depth--;
2094 break;
2095 default:
2096 break;
2097 }
2098 if (loop_depth || if_depth)
2099 continue;
2100
2101 if (inst->opcode != BRW_OPCODE_MOV ||
2102 inst->predicate ||
2103 inst->saturate ||
2104 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2105           inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                         inst->src[0].file != UNIFORM) ||
2106 inst->dst.type != inst->src[0].type)
2107 continue;
2108
2109 bool has_source_modifiers = (inst->src[0].abs ||
2110 inst->src[0].negate ||
2111 inst->src[0].smear != -1 ||
2112 inst->src[0].file == UNIFORM);
2113
2114 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2115 * them: check for no writes to either one until the exit of the
2116 * program.
2117 */
2118 bool interfered = false;
2119
2120 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2121 !scan_inst->is_tail_sentinel();
2122 scan_inst = (fs_inst *)scan_inst->next) {
2123 if (scan_inst->dst.file == GRF) {
2124 if (scan_inst->overwrites_reg(inst->dst) ||
2125 scan_inst->overwrites_reg(inst->src[0])) {
2126 interfered = true;
2127 break;
2128 }
2129 }
2130
2131 /* The gen6 MATH instruction can't handle source modifiers or
2132 * unusual register regions, so avoid coalescing those for
2133 * now. We should do something more specific.
2134 */
2135 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2136 interfered = true;
2137 break;
2138 }
2139
2140 /* The accumulator result appears to get used for the
2141 * conditional modifier generation. When negating a UD
2142 * value, there is a 33rd bit generated for the sign in the
2143 * accumulator value, so now you can't check, for example,
2144 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2145 */
2146 if (scan_inst->conditional_mod &&
2147 inst->src[0].negate &&
2148 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2149 interfered = true;
2150 break;
2151 }
2152 }
2153 if (interfered) {
2154 continue;
2155 }
2156
2157 /* Rewrite the later usage to point at the source of the move to
2158 * be removed.
2159 */
2160 for (fs_inst *scan_inst = inst;
2161 !scan_inst->is_tail_sentinel();
2162 scan_inst = (fs_inst *)scan_inst->next) {
2163 for (int i = 0; i < 3; i++) {
2164 if (scan_inst->src[i].file == GRF &&
2165 scan_inst->src[i].reg == inst->dst.reg &&
2166 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2167 fs_reg new_src = inst->src[0];
2168 if (scan_inst->src[i].abs) {
2169 new_src.negate = 0;
2170 new_src.abs = 1;
2171 }
2172 new_src.negate ^= scan_inst->src[i].negate;
2173 scan_inst->src[i] = new_src;
2174 }
2175 }
2176 }
2177
2178 inst->remove();
2179 progress = true;
2180 }
2181
2182 if (progress)
2183 live_intervals_valid = false;
2184
2185 return progress;
2186 }
2187
2188
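/**
 * Attempts to turn a MOV from a GRF into an MRF into a direct
 * compute-to-MRF: the instruction that produced the GRF value is rewritten
 * to write the MRF itself and the MOV is removed.
 *
 * Illustrative example (register numbers are arbitrary):
 *
 *    add vgrf3, vgrf1, vgrf2
 *    mov m4, vgrf3
 *
 * becomes
 *
 *    add m4, vgrf1, vgrf2
 *
 * This only happens when the GRF is not read again afterwards and nothing
 * between the two instructions interferes with the rewrite.
 */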
2189 bool
2190 fs_visitor::compute_to_mrf()
2191 {
2192 bool progress = false;
2193 int next_ip = 0;
2194
2195 calculate_live_intervals();
2196
2197 foreach_list_safe(node, &this->instructions) {
2198 fs_inst *inst = (fs_inst *)node;
2199
2200 int ip = next_ip;
2201 next_ip++;
2202
2203 if (inst->opcode != BRW_OPCODE_MOV ||
2204 inst->predicate ||
2205 inst->dst.file != MRF || inst->src[0].file != GRF ||
2206 inst->dst.type != inst->src[0].type ||
2207 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2208 continue;
2209
2210 /* Work out which hardware MRF registers are written by this
2211 * instruction.
2212 */
2213 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2214 int mrf_high;
2215 if (inst->dst.reg & BRW_MRF_COMPR4) {
2216 mrf_high = mrf_low + 4;
2217 } else if (dispatch_width == 16 &&
2218 (!inst->force_uncompressed && !inst->force_sechalf)) {
2219 mrf_high = mrf_low + 1;
2220 } else {
2221 mrf_high = mrf_low;
2222 }
2223
2224 /* Can't compute-to-MRF this GRF if someone else was going to
2225 * read it later.
2226 */
2227 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2228 continue;
2229
2230 /* Found a move of a GRF to a MRF. Let's see if we can go
2231 * rewrite the thing that made this GRF to write into the MRF.
2232 */
2233 fs_inst *scan_inst;
2234 for (scan_inst = (fs_inst *)inst->prev;
2235 scan_inst->prev != NULL;
2236 scan_inst = (fs_inst *)scan_inst->prev) {
2237 if (scan_inst->dst.file == GRF &&
2238 scan_inst->dst.reg == inst->src[0].reg) {
2239 /* Found the last thing to write our reg we want to turn
2240 * into a compute-to-MRF.
2241 */
2242
2243 /* If this one instruction didn't populate all the
2244 * channels, bail. We might be able to rewrite everything
2245 * that writes that reg, but it would require smarter
2246 * tracking to delay the rewriting until complete success.
2247 */
2248 if (scan_inst->is_partial_write())
2249 break;
2250
2251 /* Things returning more than one register would need us to
2252 * understand coalescing out more than one MOV at a time.
2253 */
2254 if (scan_inst->regs_written > 1)
2255 break;
2256
2257 /* SEND instructions can't have MRF as a destination. */
2258 if (scan_inst->mlen)
2259 break;
2260
2261 if (intel->gen == 6) {
2262 /* gen6 math instructions must have the destination be
2263 * GRF, so no compute-to-MRF for them.
2264 */
2265 if (scan_inst->is_math()) {
2266 break;
2267 }
2268 }
2269
2270 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2271 /* Found the creator of our MRF's source value. */
2272 scan_inst->dst.file = MRF;
2273 scan_inst->dst.reg = inst->dst.reg;
2274 scan_inst->saturate |= inst->saturate;
2275 inst->remove();
2276 progress = true;
2277 }
2278 break;
2279 }
2280
2281 /* We don't handle control flow here. Most computation of
2282           * values that end up in MRFs happens shortly before the MRF
2283 * write anyway.
2284 */
2285 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2286 break;
2287
2288 /* You can't read from an MRF, so if someone else reads our
2289 * MRF's source GRF that we wanted to rewrite, that stops us.
2290 */
2291 bool interfered = false;
2292 for (int i = 0; i < 3; i++) {
2293 if (scan_inst->src[i].file == GRF &&
2294 scan_inst->src[i].reg == inst->src[0].reg &&
2295 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2296 interfered = true;
2297 }
2298 }
2299 if (interfered)
2300 break;
2301
2302 if (scan_inst->dst.file == MRF) {
2303 /* If somebody else writes our MRF here, we can't
2304 * compute-to-MRF before that.
2305 */
2306 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2307 int scan_mrf_high;
2308
2309 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2310 scan_mrf_high = scan_mrf_low + 4;
2311 } else if (dispatch_width == 16 &&
2312 (!scan_inst->force_uncompressed &&
2313 !scan_inst->force_sechalf)) {
2314 scan_mrf_high = scan_mrf_low + 1;
2315 } else {
2316 scan_mrf_high = scan_mrf_low;
2317 }
2318
2319 if (mrf_low == scan_mrf_low ||
2320 mrf_low == scan_mrf_high ||
2321 mrf_high == scan_mrf_low ||
2322 mrf_high == scan_mrf_high) {
2323 break;
2324 }
2325 }
2326
2327 if (scan_inst->mlen > 0) {
2328 /* Found a SEND instruction, which means that there are
2329 * live values in MRFs from base_mrf to base_mrf +
2330 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2331 * above it.
2332 */
2333 if (mrf_low >= scan_inst->base_mrf &&
2334 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2335 break;
2336 }
2337 if (mrf_high >= scan_inst->base_mrf &&
2338 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2339 break;
2340 }
2341 }
2342 }
2343 }
2344
2345 if (progress)
2346 live_intervals_valid = false;
2347
2348 return progress;
2349 }
2350
2351 /**
2352 * Walks through basic blocks, looking for repeated MRF writes and
2353 * removing the later ones.
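 *
 * Illustrative example (register numbers are arbitrary): two identical
 * "mov m2, vgrf5" instructions with no intervening write to m2 or vgrf5
 * leave the second one redundant, so it is removed.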
2354 */
2355 bool
2356 fs_visitor::remove_duplicate_mrf_writes()
2357 {
2358 fs_inst *last_mrf_move[16];
2359 bool progress = false;
2360
2361    /* We would need to update the MRF tracking for compressed (16-wide) instructions, so skip this pass for now. */
2362 if (dispatch_width == 16)
2363 return false;
2364
2365 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2366
2367 foreach_list_safe(node, &this->instructions) {
2368 fs_inst *inst = (fs_inst *)node;
2369
2370 if (inst->is_control_flow()) {
2371 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2372 }
2373
2374 if (inst->opcode == BRW_OPCODE_MOV &&
2375 inst->dst.file == MRF) {
2376 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2377 if (prev_inst && inst->equals(prev_inst)) {
2378 inst->remove();
2379 progress = true;
2380 continue;
2381 }
2382 }
2383
2384 /* Clear out the last-write records for MRFs that were overwritten. */
2385 if (inst->dst.file == MRF) {
2386 last_mrf_move[inst->dst.reg] = NULL;
2387 }
2388
2389 if (inst->mlen > 0) {
2390 /* Found a SEND instruction, which will include two or fewer
2391 * implied MRF writes. We could do better here.
2392 */
2393 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2394 last_mrf_move[inst->base_mrf + i] = NULL;
2395 }
2396 }
2397
2398 /* Clear out any MRF move records whose sources got overwritten. */
2399 if (inst->dst.file == GRF) {
2400 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2401 if (last_mrf_move[i] &&
2402 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2403 last_mrf_move[i] = NULL;
2404 }
2405 }
2406 }
2407
2408 if (inst->opcode == BRW_OPCODE_MOV &&
2409 inst->dst.file == MRF &&
2410 inst->src[0].file == GRF &&
2411 !inst->predicate) {
2412 last_mrf_move[inst->dst.reg] = inst;
2413 }
2414 }
2415
2416 if (progress)
2417 live_intervals_valid = false;
2418
2419 return progress;
2420 }
2421
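/**
 * Helper for the gen4 SEND dependency workarounds below: clears the
 * needs-dependency flag for any register in [first_grf, first_grf + grf_len)
 * that this instruction reads, since a read already resolves the hazard.
 */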
2422 static void
2423 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2424 int first_grf, int grf_len)
2425 {
2426 bool inst_16wide = (dispatch_width > 8 &&
2427 !inst->force_uncompressed &&
2428 !inst->force_sechalf);
2429
2430 /* Clear the flag for registers that actually got read (as expected). */
2431 for (int i = 0; i < 3; i++) {
2432 int grf;
2433 if (inst->src[i].file == GRF) {
2434 grf = inst->src[i].reg;
2435 } else if (inst->src[i].file == FIXED_HW_REG &&
2436 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2437 grf = inst->src[i].fixed_hw_reg.nr;
2438 } else {
2439 continue;
2440 }
2441
2442 if (grf >= first_grf &&
2443 grf < first_grf + grf_len) {
2444 deps[grf - first_grf] = false;
2445 if (inst_16wide)
2446 deps[grf - first_grf + 1] = false;
2447 }
2448 }
2449 }
2450
2451 /**
2452 * Implements this workaround for the original 965:
2453 *
2454 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2455 * check for post destination dependencies on this instruction, software
2456 * must ensure that there is no destination hazard for the case of ‘write
2457 * followed by a posted write’ shown in the following example.
2458 *
2459 * 1. mov r3 0
2460 * 2. send r3.xy <rest of send instruction>
2461 * 3. mov r2 r3
2462 *
2463 * Due to no post-destination dependency check on the ‘send’, the above
2464 * code sequence could have two instructions (1 and 2) in flight at the
2465 * same time that both consider ‘r3’ as the target of their final writes.
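 *
 * The workaround implemented below walks backwards from the SEND and, for
 * each destination register with such an outstanding unread write, inserts
 * a dependency-resolving MOV that reads the register just before the SEND,
 * forcing the earlier write to complete first.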
2466 */
2467 void
2468 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2469 {
2470 int reg_size = dispatch_width / 8;
2471 int write_len = inst->regs_written * reg_size;
2472 int first_write_grf = inst->dst.reg;
2473 bool needs_dep[BRW_MAX_MRF];
2474 assert(write_len < (int)sizeof(needs_dep) - 1);
2475
2476 memset(needs_dep, false, sizeof(needs_dep));
2477 memset(needs_dep, true, write_len);
2478
2479 clear_deps_for_inst_src(inst, dispatch_width,
2480 needs_dep, first_write_grf, write_len);
2481
2482 /* Walk backwards looking for writes to registers we're writing which
2483 * aren't read since being written. If we hit the start of the program,
2484 * we assume that there are no outstanding dependencies on entry to the
2485 * program.
2486 */
2487 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2488 scan_inst != NULL;
2489 scan_inst = (fs_inst *)scan_inst->prev) {
2490
2491 /* If we hit control flow, assume that there *are* outstanding
2492 * dependencies, and force their cleanup before our instruction.
2493 */
2494 if (scan_inst->is_control_flow()) {
2495 for (int i = 0; i < write_len; i++) {
2496 if (needs_dep[i]) {
2497 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2498 }
2499 }
2500 return;
2501 }
2502
2503 bool scan_inst_16wide = (dispatch_width > 8 &&
2504 !scan_inst->force_uncompressed &&
2505 !scan_inst->force_sechalf);
2506
2507 /* We insert our reads as late as possible on the assumption that any
2508 * instruction but a MOV that might have left us an outstanding
2509 * dependency has more latency than a MOV.
2510 */
2511 if (scan_inst->dst.file == GRF) {
2512 for (int i = 0; i < scan_inst->regs_written; i++) {
2513 int reg = scan_inst->dst.reg + i * reg_size;
2514
2515 if (reg >= first_write_grf &&
2516 reg < first_write_grf + write_len &&
2517 needs_dep[reg - first_write_grf]) {
2518 inst->insert_before(DEP_RESOLVE_MOV(reg));
2519 needs_dep[reg - first_write_grf] = false;
2520 if (scan_inst_16wide)
2521 needs_dep[reg - first_write_grf + 1] = false;
2522 }
2523 }
2524 }
2525
2526 /* Clear the flag for registers that actually got read (as expected). */
2527 clear_deps_for_inst_src(scan_inst, dispatch_width,
2528 needs_dep, first_write_grf, write_len);
2529
2530 /* Continue the loop only if we haven't resolved all the dependencies */
2531 int i;
2532 for (i = 0; i < write_len; i++) {
2533 if (needs_dep[i])
2534 break;
2535 }
2536 if (i == write_len)
2537 return;
2538 }
2539 }
2540
2541 /**
2542 * Implements this workaround for the original 965:
2543 *
2544 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2545 * used as a destination register until after it has been sourced by an
2546 * instruction with a different destination register.
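 *
 * The workaround implemented below walks forwards from the SEND and inserts
 * a dependency-resolving MOV that sources the SEND's destination register
 * before any instruction that would overwrite it without having read it.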
2547 */
2548 void
2549 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2550 {
2551 int write_len = inst->regs_written * dispatch_width / 8;
2552 int first_write_grf = inst->dst.reg;
2553 bool needs_dep[BRW_MAX_MRF];
2554 assert(write_len < (int)sizeof(needs_dep) - 1);
2555
2556 memset(needs_dep, false, sizeof(needs_dep));
2557 memset(needs_dep, true, write_len);
2558 /* Walk forwards looking for writes to registers we're writing which aren't
2559 * read before being written.
2560 */
2561 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2562 !scan_inst->is_tail_sentinel();
2563 scan_inst = (fs_inst *)scan_inst->next) {
2564 /* If we hit control flow, force resolve all remaining dependencies. */
2565 if (scan_inst->is_control_flow()) {
2566 for (int i = 0; i < write_len; i++) {
2567 if (needs_dep[i])
2568 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2569 }
2570 return;
2571 }
2572
2573 /* Clear the flag for registers that actually got read (as expected). */
2574 clear_deps_for_inst_src(scan_inst, dispatch_width,
2575 needs_dep, first_write_grf, write_len);
2576
2577 /* We insert our reads as late as possible since they're reading the
2578 * result of a SEND, which has massive latency.
2579 */
2580 if (scan_inst->dst.file == GRF &&
2581 scan_inst->dst.reg >= first_write_grf &&
2582 scan_inst->dst.reg < first_write_grf + write_len &&
2583 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2584 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2585 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2586 }
2587
2588 /* Continue the loop only if we haven't resolved all the dependencies */
2589 int i;
2590 for (i = 0; i < write_len; i++) {
2591 if (needs_dep[i])
2592 break;
2593 }
2594 if (i == write_len)
2595 return;
2596 }
2597
2598 /* If we hit the end of the program, resolve all remaining dependencies out
2599 * of paranoia.
2600 */
2601 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2602 assert(last_inst->eot);
2603 for (int i = 0; i < write_len; i++) {
2604 if (needs_dep[i])
2605 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2606 }
2607 }
2608
2609 void
2610 fs_visitor::insert_gen4_send_dependency_workarounds()
2611 {
2612 if (intel->gen != 4 || intel->is_g4x)
2613 return;
2614
2615 /* Note that we're done with register allocation, so GRF fs_regs always
2616 * have a .reg_offset of 0.
2617 */
2618
2619 foreach_list_safe(node, &this->instructions) {
2620 fs_inst *inst = (fs_inst *)node;
2621
2622 if (inst->mlen != 0 && inst->dst.file == GRF) {
2623 insert_gen4_pre_send_dependency_workarounds(inst);
2624 insert_gen4_post_send_dependency_workarounds(inst);
2625 }
2626 }
2627 }
2628
2629 /**
2630 * Turns the generic expression-style uniform pull constant load instruction
2631 * into a hardware-specific series of instructions for loading a pull
2632 * constant.
2633 *
2634 * The expression style allows the CSE pass before this to optimize out
2635 * repeated loads from the same offset, and gives the pre-register-allocation
2636 * scheduling full flexibility, while the conversion to native instructions
2637 * allows the post-register-allocation scheduler the best information
2638 * possible.
2639 *
2640 * Note that execution masking for setting up pull constant loads is special:
2641 * the channels that need to be written are unrelated to the current execution
2642 * mask, since a later instruction will use one of the result channels as a
2643 * source operand for all 8 or 16 of its channels.
2644 */
2645 void
2646 fs_visitor::lower_uniform_pull_constant_loads()
2647 {
2648 foreach_list(node, &this->instructions) {
2649 fs_inst *inst = (fs_inst *)node;
2650
2651 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2652 continue;
2653
2654 if (intel->gen >= 7) {
2655 /* The offset arg before was a vec4-aligned byte offset. We need to
2656 * turn it into a dword offset.
2657 */
2658 fs_reg const_offset_reg = inst->src[1];
2659 assert(const_offset_reg.file == IMM &&
2660 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2661 const_offset_reg.imm.u /= 4;
2662 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2663
2664 /* This is actually going to be a MOV, but since only the first dword
2665 * is accessed, we have a special opcode to do just that one. Note
2666 * that this needs to be an operation that will be considered a def
2667 * by live variable analysis, or register allocation will explode.
2668 */
2669 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2670 payload, const_offset_reg);
2671 setup->force_writemask_all = true;
2672
2673 setup->ir = inst->ir;
2674 setup->annotation = inst->annotation;
2675 inst->insert_before(setup);
2676
2677 /* Similarly, this will only populate the first 4 channels of the
2678 * result register (since we only use smear values from 0-3), but we
2679 * don't tell the optimizer.
2680 */
2681 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2682 inst->src[1] = payload;
2683
2684 this->live_intervals_valid = false;
2685 } else {
2686 /* Before register allocation, we didn't tell the scheduler about the
2687 * MRF we use. We know it's safe to use this MRF because nothing
2688 * else does except for register spill/unspill, which generates and
2689 * uses its MRF within a single IR instruction.
2690 */
2691 inst->base_mrf = 14;
2692 inst->mlen = 1;
2693 }
2694 }
2695 }
2696
2697 void
2698 fs_visitor::dump_instruction(fs_inst *inst)
2699 {
2700 if (inst->predicate) {
2701 printf("(%cf0.%d) ",
2702 inst->predicate_inverse ? '-' : '+',
2703 inst->flag_subreg);
2704 }
2705
2706 printf("%s", brw_instruction_name(inst->opcode));
2707 if (inst->saturate)
2708 printf(".sat");
2709 if (inst->conditional_mod) {
2710 printf(".cmod");
2711 if (!inst->predicate &&
2712 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2713 inst->opcode != BRW_OPCODE_IF &&
2714 inst->opcode != BRW_OPCODE_WHILE))) {
2715 printf(".f0.%d\n", inst->flag_subreg);
2716 }
2717 }
2718 printf(" ");
2719
2720
2721 switch (inst->dst.file) {
2722 case GRF:
2723 printf("vgrf%d", inst->dst.reg);
2724 if (inst->dst.reg_offset)
2725 printf("+%d", inst->dst.reg_offset);
2726 break;
2727 case MRF:
2728 printf("m%d", inst->dst.reg);
2729 break;
2730 case BAD_FILE:
2731 printf("(null)");
2732 break;
2733 case UNIFORM:
2734 printf("***u%d***", inst->dst.reg);
2735 break;
2736 default:
2737 printf("???");
2738 break;
2739 }
2740 printf(", ");
2741
2742 for (int i = 0; i < 3; i++) {
2743 if (inst->src[i].negate)
2744 printf("-");
2745 if (inst->src[i].abs)
2746 printf("|");
2747 switch (inst->src[i].file) {
2748 case GRF:
2749 printf("vgrf%d", inst->src[i].reg);
2750 if (inst->src[i].reg_offset)
2751 printf("+%d", inst->src[i].reg_offset);
2752 break;
2753 case MRF:
2754 printf("***m%d***", inst->src[i].reg);
2755 break;
2756 case UNIFORM:
2757 printf("u%d", inst->src[i].reg);
2758 if (inst->src[i].reg_offset)
2759 printf(".%d", inst->src[i].reg_offset);
2760 break;
2761 case BAD_FILE:
2762 printf("(null)");
2763 break;
2764 case IMM:
2765 switch (inst->src[i].type) {
2766 case BRW_REGISTER_TYPE_F:
2767 printf("%ff", inst->src[i].imm.f);
2768 break;
2769 case BRW_REGISTER_TYPE_D:
2770 printf("%dd", inst->src[i].imm.i);
2771 break;
2772 case BRW_REGISTER_TYPE_UD:
2773 printf("%uu", inst->src[i].imm.u);
2774 break;
2775 default:
2776 printf("???");
2777 break;
2778 }
2779 break;
2780 default:
2781 printf("???");
2782 break;
2783 }
2784 if (inst->src[i].abs)
2785 printf("|");
2786
2787 if (i < 3)
2788 printf(", ");
2789 }
2790
2791 printf(" ");
2792
2793 if (inst->force_uncompressed)
2794 printf("1sthalf ");
2795
2796 if (inst->force_sechalf)
2797 printf("2ndhalf ");
2798
2799 printf("\n");
2800 }
2801
2802 void
2803 fs_visitor::dump_instructions()
2804 {
2805 int ip = 0;
2806 foreach_list(node, &this->instructions) {
2807 fs_inst *inst = (fs_inst *)node;
2808 printf("%d: ", ip++);
2809 dump_instruction(inst);
2810 }
2811 }
2812
2813 /**
2814 * Possibly returns an instruction that set up @param reg.
2815 *
2816 * Sometimes we want to take the result of some expression/variable
2817 * dereference tree and rewrite the instruction generating the result
2818 * of the tree. When processing the tree, we know that the
2819 * instructions generated are all writing temporaries that are dead
2820 * outside of this tree. So, if we have some instructions that write
2821 * a temporary, we're free to point that temp write somewhere else.
2822 *
2823  * Note that this doesn't guarantee that the returned instruction wrote
2824  * only reg -- it might be the size=4 destination of a texture instruction.
2825 */
2826 fs_inst *
2827 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2828 fs_inst *end,
2829 fs_reg reg)
2830 {
2831 if (end == start ||
2832 end->is_partial_write() ||
2833 reg.reladdr ||
2834 !reg.equals(end->dst)) {
2835 return NULL;
2836 } else {
2837 return end;
2838 }
2839 }
2840
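/**
 * Lays out the gen6+ thread payload: masks and pixel X/Y in R0-1, then one
 * set of barycentric coordinates per enabled interpolation mode, then
 * interpolated depth and W when the shader reads gl_FragCoord.
 */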
2841 void
2842 fs_visitor::setup_payload_gen6()
2843 {
2844 struct intel_context *intel = &brw->intel;
2845 bool uses_depth =
2846 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2847 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2848
2849 assert(intel->gen >= 6);
2850
2851 /* R0-1: masks, pixel X/Y coordinates. */
2852 c->nr_payload_regs = 2;
2853    /* R2: only for 32-pixel dispatch. */
2854
2855 /* R3-26: barycentric interpolation coordinates. These appear in the
2856 * same order that they appear in the brw_wm_barycentric_interp_mode
2857 * enum. Each set of coordinates occupies 2 registers if dispatch width
2858 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2859 * appear if they were enabled using the "Barycentric Interpolation
2860 * Mode" bits in WM_STATE.
2861 */
2862 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2863 if (barycentric_interp_modes & (1 << i)) {
2864 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2865 c->nr_payload_regs += 2;
2866 if (dispatch_width == 16) {
2867 c->nr_payload_regs += 2;
2868 }
2869 }
2870 }
2871
2872 /* R27: interpolated depth if uses source depth */
2873 if (uses_depth) {
2874 c->source_depth_reg = c->nr_payload_regs;
2875 c->nr_payload_regs++;
2876 if (dispatch_width == 16) {
2877 /* R28: interpolated depth if not 8-wide. */
2878 c->nr_payload_regs++;
2879 }
2880 }
2881 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2882 if (uses_depth) {
2883 c->source_w_reg = c->nr_payload_regs;
2884 c->nr_payload_regs++;
2885 if (dispatch_width == 16) {
2886 /* R30: interpolated W if not 8-wide. */
2887 c->nr_payload_regs++;
2888 }
2889 }
2890 /* R31: MSAA position offsets. */
2891 /* R32-: bary for 32-pixel. */
2892 /* R58-59: interp W for 32-pixel. */
2893
2894 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2895 c->source_depth_to_render_target = true;
2896 }
2897 }
2898
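/**
 * Drives the compile for a single dispatch width: sets up the payload,
 * generates FS IR from the GLSL IR or ARB fragment program, runs the
 * optimization loop, schedules instructions, and allocates registers.
 * Returns false on failure.
 */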
2899 bool
2900 fs_visitor::run()
2901 {
2902 sanity_param_count = fp->Base.Parameters->NumParameters;
2903 uint32_t orig_nr_params = c->prog_data.nr_params;
2904
2905 if (intel->gen >= 6)
2906 setup_payload_gen6();
2907 else
2908 setup_payload_gen4();
2909
2910 if (0) {
2911 emit_dummy_fs();
2912 } else {
2913 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2914 emit_shader_time_begin();
2915
2916 calculate_urb_setup();
2917 if (intel->gen < 6)
2918 emit_interpolation_setup_gen4();
2919 else
2920 emit_interpolation_setup_gen6();
2921
2922 /* We handle discards by keeping track of the still-live pixels in f0.1.
2923 * Initialize it with the dispatched pixels.
2924 */
2925 if (fp->UsesKill) {
2926 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2927 discard_init->flag_subreg = 1;
2928 }
2929
2930 /* Generate FS IR for main(). (the visitor only descends into
2931 * functions called "main").
2932 */
2933 if (shader) {
2934 foreach_list(node, &*shader->ir) {
2935 ir_instruction *ir = (ir_instruction *)node;
2936 base_ir = ir;
2937 this->result = reg_undef;
2938 ir->accept(this);
2939 }
2940 } else {
2941 emit_fragment_program_code();
2942 }
2943 base_ir = NULL;
2944 if (failed)
2945 return false;
2946
2947 emit(FS_OPCODE_PLACEHOLDER_HALT);
2948
2949 emit_fb_writes();
2950
2951 split_virtual_grfs();
2952
2953 move_uniform_array_access_to_pull_constants();
2954 setup_pull_constants();
2955
2956 bool progress;
2957 do {
2958 progress = false;
2959
2960 compact_virtual_grfs();
2961
2962 progress = remove_duplicate_mrf_writes() || progress;
2963
2964 progress = opt_algebraic() || progress;
2965 progress = opt_cse() || progress;
2966 progress = opt_copy_propagate() || progress;
2967 progress = dead_code_eliminate() || progress;
2968 progress = dead_code_eliminate_local() || progress;
2969 progress = register_coalesce() || progress;
2970 progress = register_coalesce_2() || progress;
2971 progress = compute_to_mrf() || progress;
2972 } while (progress);
2973
2974 remove_dead_constants();
2975
2976 schedule_instructions(false);
2977
2978 lower_uniform_pull_constant_loads();
2979
2980 assign_curb_setup();
2981 assign_urb_setup();
2982
2983 if (0) {
2984 /* Debug of register spilling: Go spill everything. */
2985 for (int i = 0; i < virtual_grf_count; i++) {
2986 spill_reg(i);
2987 }
2988 }
2989
2990 if (0)
2991 assign_regs_trivial();
2992 else {
2993 while (!assign_regs()) {
2994 if (failed)
2995 break;
2996 }
2997 }
2998 }
2999 assert(force_uncompressed_stack == 0);
3000 assert(force_sechalf_stack == 0);
3001
3002 /* This must come after all optimization and register allocation, since
3003 * it inserts dead code that happens to have side effects, and it does
3004 * so based on the actual physical registers in use.
3005 */
3006 insert_gen4_send_dependency_workarounds();
3007
3008 if (failed)
3009 return false;
3010
3011 schedule_instructions(true);
3012
3013 if (dispatch_width == 8) {
3014 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3015 } else {
3016 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3017
3018 /* Make sure we didn't try to sneak in an extra uniform */
3019 assert(orig_nr_params == c->prog_data.nr_params);
3020 (void) orig_nr_params;
3021 }
3022
3023 /* If any state parameters were appended, then ParameterValues could have
3024 * been realloced, in which case the driver uniform storage set up by
3025 * _mesa_associate_uniform_storage() would point to freed memory. Make
3026 * sure that didn't happen.
3027 */
3028 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3029
3030 return !failed;
3031 }
3032
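/**
 * Compiles the fragment shader at 8-wide, additionally at 16-wide when the
 * hardware and shader permit it, and generates native code for the
 * resulting program(s).
 */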
3033 const unsigned *
3034 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3035 struct gl_fragment_program *fp,
3036 struct gl_shader_program *prog,
3037 unsigned *final_assembly_size)
3038 {
3039 struct intel_context *intel = &brw->intel;
3040 bool start_busy = false;
3041 float start_time = 0;
3042
3043 if (unlikely(intel->perf_debug)) {
3044 start_busy = (intel->batch.last_bo &&
3045 drm_intel_bo_busy(intel->batch.last_bo));
3046 start_time = get_time();
3047 }
3048
3049 struct brw_shader *shader = NULL;
3050 if (prog)
3051 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3052
3053 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3054 if (prog) {
3055 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3056 _mesa_print_ir(shader->ir, NULL);
3057 printf("\n\n");
3058 } else {
3059 printf("ARB_fragment_program %d ir for native fragment shader\n",
3060 fp->Base.Id);
3061 _mesa_print_program(&fp->Base);
3062 }
3063 }
3064
3065 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3066 */
3067 fs_visitor v(brw, c, prog, fp, 8);
3068 if (!v.run()) {
3069 if (prog) {
3070 prog->LinkStatus = false;
3071 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3072 }
3073
3074 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3075 v.fail_msg);
3076
3077 return NULL;
3078 }
3079
3080 exec_list *simd16_instructions = NULL;
3081 fs_visitor v2(brw, c, prog, fp, 16);
3082 bool no16 = INTEL_DEBUG & DEBUG_NO16;
3083 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
3084 v2.import_uniforms(&v);
3085 if (!v2.run()) {
3086 perf_debug("16-wide shader failed to compile, falling back to "
3087 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3088 } else {
3089 simd16_instructions = &v2.instructions;
3090 }
3091 }
3092
3093 c->prog_data.dispatch_width = 8;
3094
3095 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3096 const unsigned *generated = g.generate_assembly(&v.instructions,
3097 simd16_instructions,
3098 final_assembly_size);
3099
3100 if (unlikely(intel->perf_debug) && shader) {
3101 if (shader->compiled_once)
3102 brw_wm_debug_recompile(brw, prog, &c->key);
3103 shader->compiled_once = true;
3104
3105 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
3106 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3107 (get_time() - start_time) * 1000);
3108 }
3109 }
3110
3111 return generated;
3112 }
3113
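/**
 * Precompiles the fragment shader at link time using a guessed program key,
 * so a plausible variant exists before the first draw; the previously
 * compiled program is restored afterwards.
 */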
3114 bool
3115 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3116 {
3117 struct brw_context *brw = brw_context(ctx);
3118 struct intel_context *intel = &brw->intel;
3119 struct brw_wm_prog_key key;
3120
3121 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3122 return true;
3123
3124 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3125 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3126 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3127 bool program_uses_dfdy = fp->UsesDFdy;
3128
3129 memset(&key, 0, sizeof(key));
3130
3131 if (intel->gen < 6) {
3132 if (fp->UsesKill)
3133 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3134
3135 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3136 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3137
3138 /* Just assume depth testing. */
3139 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3140 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3141 }
3142
3143 if (intel->gen < 6)
3144 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3145
3146 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3147 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3148 continue;
3149
3150 if (intel->gen < 6) {
3151 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3152 key.input_slots_valid |= BITFIELD64_BIT(i);
3153 }
3154 }
3155
3156 key.clamp_fragment_color = true;
3157
3158 for (int i = 0; i < MAX_SAMPLERS; i++) {
3159 if (fp->Base.ShadowSamplers & (1 << i)) {
3160 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3161 key.tex.swizzles[i] =
3162 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3163 } else {
3164 /* Color sampler: assume no swizzling. */
3165 key.tex.swizzles[i] = SWIZZLE_XYZW;
3166 }
3167 }
3168
3169 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3170 key.drawable_height = ctx->DrawBuffer->Height;
3171 }
3172
3173 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3174 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3175 }
3176
3177 key.nr_color_regions = 1;
3178
3179 key.program_string_id = bfp->id;
3180
3181 uint32_t old_prog_offset = brw->wm.prog_offset;
3182 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3183
3184 bool success = do_wm_prog(brw, prog, bfp, &key);
3185
3186 brw->wm.prog_offset = old_prog_offset;
3187 brw->wm.prog_data = old_prog_data;
3188
3189 return success;
3190 }